pax_global_header00006660000000000000000000000064136676627630014540gustar00rootroot0000000000000052 comment=ffabce017386f2d02318aa323fa69e2e130078ed imbalanced-learn-0.7.0/000077500000000000000000000000001366766276300147225ustar00rootroot00000000000000imbalanced-learn-0.7.0/.circleci/000077500000000000000000000000001366766276300165555ustar00rootroot00000000000000imbalanced-learn-0.7.0/.circleci/config.yml000066400000000000000000000023401366766276300205440ustar00rootroot00000000000000version: 2 jobs: python3: docker: - image: circleci/python:3.6.1 environment: - USERNAME: "glemaitre" - ORGANIZATION: "imbalanced-learn" - DOC_REPO: "imbalanced-learn.github.io" - DOC_URL: "" - EMAIL: "g.lemaitre58@gmail.com" - MINICONDA_PATH: ~/miniconda - CONDA_ENV_NAME: testenv - PYTHON_VERSION: 3 steps: - add_ssh_keys: fingerprints: - "34:ea:b1:d9:b1:e2:5d:79:81:c4:d0:39:ca:85:e1:ef" - checkout - run: ./build_tools/circle/checkout_merge_commit.sh - run: ./build_tools/circle/build_doc.sh - store_artifacts: path: doc/_build/html destination: doc - store_artifacts: path: ~/log.txt - persist_to_workspace: root: doc/_build/html paths: . - attach_workspace: at: doc/_build/html - run: ls -ltrh doc/_build/html - deploy: command: | if [[ "${CIRCLE_BRANCH}" =~ ^master$|^[0-9]+\.[0-9]+\.X$ ]]; then bash ./build_tools/circle/push_doc.sh doc/_build/html fi filters: branches: ignore: gh-pages workflows: version: 2 build-doc-and-deploy: jobs: - python3 imbalanced-learn-0.7.0/.coveragerc000066400000000000000000000005451366766276300170470ustar00rootroot00000000000000# Configuration for coverage.py [run] branch = True source = imblearn include = */imblearn/* omit = */setup.py [report] exclude_lines = pragma: no cover def __repr__ if self.debug: if settings.DEBUG raise AssertionError raise NotImplementedError if 0: if __name__ == .__main__.: if self.verbose: show_missing = Trueimbalanced-learn-0.7.0/.github/000077500000000000000000000000001366766276300162625ustar00rootroot00000000000000imbalanced-learn-0.7.0/.github/ISSUE_TEMPLATE.md000066400000000000000000000031711366766276300207710ustar00rootroot00000000000000 #### Description #### Steps/Code to Reproduce #### Expected Results #### Actual Results #### Versions imbalanced-learn-0.7.0/.github/ISSUE_TEMPLATE/000077500000000000000000000000001366766276300204455ustar00rootroot00000000000000imbalanced-learn-0.7.0/.github/ISSUE_TEMPLATE/bug_report.md000066400000000000000000000030641366766276300231420ustar00rootroot00000000000000--- name: Bug report about: Create a report to help us reproduce and correct the bug title: "[BUG]" labels: bug assignees: '' --- #### Describe the bug A clear and concise description of what the bug is. #### Steps/Code to Reproduce ``` Sample code to reproduce the problem ``` #### Expected Results #### Actual Results #### Versions imbalanced-learn-0.7.0/.github/ISSUE_TEMPLATE/documentation-improvement.md000066400000000000000000000006231366766276300262040ustar00rootroot00000000000000--- name: Documentation improvement about: Create a report to help us improve the documentation title: "[DOC]" labels: Documentation, help wanted, good first issue assignees: '' --- #### Describe the issue linked to the documentation Tell us about the confusion introduced in the documentation. #### Suggest a potential alternative/fix Tell us how we could improve the documentation in this regard. 
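For convenience, the version report requested under "#### Versions" in the bug report template above can be generated with the snippet below; it is the same snippet documented in CONTRIBUTING.md and assumes the listed packages are already installed:

```python
# Collect the platform and package versions requested by the bug report template.
import platform; print(platform.platform())
import sys; print("Python", sys.version)
import numpy; print("NumPy", numpy.__version__)
import scipy; print("SciPy", scipy.__version__)
import sklearn; print("Scikit-Learn", sklearn.__version__)
import imblearn; print("Imbalanced-Learn", imblearn.__version__)
```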
imbalanced-learn-0.7.0/.github/ISSUE_TEMPLATE/feature_request.md000066400000000000000000000010271366766276300241720ustar00rootroot00000000000000--- name: Feature request about: Suggest a new algorithm, enhancement to an existing algorithm, etc. title: "[ENH]" labels: enhancement assignees: '' --- <!-- If you want to propose a new algorithm, please refer first to the scikit-learn inclusion criterion: https://scikit-learn.org/stable/faq.html#what-are-the-inclusion-criteria-for-new-algorithms --> #### Is your feature request related to a problem? Please describe #### Describe the solution you'd like #### Describe alternatives you've considered #### Additional context imbalanced-learn-0.7.0/.github/ISSUE_TEMPLATE/other--blank-template-.md000066400000000000000000000002011366766276300251310ustar00rootroot00000000000000--- name: Other (blank template) about: For all other issues to reach the community... title: '' labels: '' assignees: '' --- imbalanced-learn-0.7.0/.github/ISSUE_TEMPLATE/question.md000066400000000000000000000003701366766276300226360ustar00rootroot00000000000000--- name: Question about: If you have a usage question title: '' labels: '' assignees: '' --- ** If your issue is a usage question, submit it here instead: - The imbalanced-learn gitter: https://gitter.im/scikit-learn-contrib/imbalanced-learn ** imbalanced-learn-0.7.0/.github/ISSUE_TEMPLATE/usage-question.md000066400000000000000000000007371366766276300237470ustar00rootroot00000000000000--- name: Usage question about: If you have a usage question title: "[SO]" labels: question assignees: '' --- ** If your issue is a usage question, submit it here instead:** - **The imbalanced-learn gitter: https://gitter.im/scikit-learn-contrib/imbalanced-learn** - **StackOverflow with the imblearn (or imbalanced-learn) tag: https://stackoverflow.com/questions/tagged/imblearn** We are going to automatically close this issue if it is not linked to a bug or an enhancement. imbalanced-learn-0.7.0/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000014631366766276300220670ustar00rootroot00000000000000 #### Reference Issue #### What does this implement/fix? Explain your changes. #### Any other comments? imbalanced-learn-0.7.0/.gitignore000066400000000000000000000023511366766276300167130ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg Pipfile Pipfile.lock # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log # Sphinx documentation docs/_build/ # PyBuilder target/ # vim *.swp # emacs *~ # Visual Studio *.sln *.pyproj *.suo *.vs .vscode/ # PyCharm .idea/ # Cython *.pyc *.pyo __pycache__ *.so *.o *.egg *.egg-info Cython/Compiler/*.c Cython/Plex/*.c Cython/Runtime/refnanny.c Cython/Tempita/*.c Cython/*.c Tools/*.elc /TEST_TMP/ /build/ /wheelhouse*/ !tests/build/ /dist/ .gitrev .coverage *.orig *.rej *.dep *.swp *~ .ipynb_checkpoints docs/build tags TAGS MANIFEST .tox cythonize.dat # build documentation doc/_build/ doc/auto_examples/ doc/generated/ doc/bibtex/autoimbalanced-learn-0.7.0/.travis.yml000066400000000000000000000020701366766276300170320ustar00rootroot00000000000000# make it explicit that we favor the new container-based travis workers dist: bionic sudo: false language: python # Pre-install packages for the ubuntu distribution cache: apt: true # We use three different cache directories # to work around a Travis bug with multi-platform cache directories: - $HOME/.cache/pip - $HOME/.cache/pip - $HOME/download env: global: # Directory where tests are run from - TEST_DIR=/tmp/test_dir - MODULE=imblearn - OMP_NUM_THREADS=4 - OPENBLAS_NUM_THREADS=4 matrix: include: - env: PYTHON_VERSION="3.8" TEST_DOC="true" TEST_NUMPYDOC="true" install: source build_tools/travis/install.sh script: bash build_tools/travis/test_script.sh after_success: source build_tools/travis/after_success.sh notifications: webhooks: urls: - https://webhooks.gitter.im/e/188e3c7a5180fd4f2120 on_success: always # options: [always|never|change] default: always on_failure: always # options: [always|never|change] default: always on_start: never # options: [always|never|change] default: always imbalanced-learn-0.7.0/AUTHORS.rst000066400000000000000000000010631366766276300166010ustar00rootroot00000000000000History ------- Development lead ~~~~~~~~~~~~~~~~ The project was started in August 2014 by Fernando Nogueira and focused on SMOTE implementation. Together with Guillaume Lemaitre, Dayvid Victor, and Christos Aridas, additional under-sampling and over-sampling methods have been implemented as well as major changes in the API to be fully compatible with scikit-learn_. Contributors ------------ Refer to the GitHub contributors page_. .. _scikit-learn: http://scikit-learn.org .. _page: https://github.com/scikit-learn-contrib/imbalanced-learn/graphs/contributors imbalanced-learn-0.7.0/CONTRIBUTING.md000066400000000000000000000155201366766276300171560ustar00rootroot00000000000000Contributing code ================= This guide is adapted from [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md). How to contribute ----------------- The preferred way to contribute to imbalanced-learn is to fork the [main repository](https://github.com/scikit-learn-contrib/imbalanced-learn) on GitHub: 1. Fork the [project repository](https://github.com/scikit-learn-contrib/imbalanced-learn): click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. 2. Clone this copy to your local disk: $ git clone git@github.com:YourLogin/imbalanced-learn.git $ cd imbalanced-learn 3. Create a branch to hold your changes: $ git checkout -b my-feature and start making changes. Never work in the ``master`` branch! 4. 
Work on this copy on your computer using Git to do the version control. When you're done editing, do: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-feature Finally, go to the web page of your fork of the imbalanced-learn repo, and click 'Pull request' to send your changes to the maintainers for review. This will send an email to the committers. (If any of the above seems like magic to you, then look up the [Git documentation](https://git-scm.com/documentation) on the web.) Contributing Pull Requests -------------------------- It is recommended to check that your contribution complies with the following rules before submitting a pull request: - Follow the [coding-guidelines](http://scikit-learn.org/dev/developers/contributing.html#coding-guidelines) as for scikit-learn. - When applicable, use the validation tools and other code in the `sklearn.utils` submodule. A list of utility routines available for developers can be found in the [Utilities for Developers](http://scikit-learn.org/dev/developers/utilities.html#developers-utils) page. - If your pull request addresses an issue, please use the title to describe the issue and mention the issue number in the pull request description to ensure a link is created to the original issue. - All public methods should have informative docstrings with sample usage presented as doctests when appropriate. - Please prefix the title of your pull request with `[MRG]` if the contribution is complete and should be subjected to a detailed review. Incomplete contributions should be prefixed `[WIP]` to indicate a work in progress (and changed to `[MRG]` when it matures). WIPs may be useful to: indicate you are working on something to avoid duplicated work, request broad review of functionality or API, or seek collaborators. WIPs often benefit from the inclusion of a [task list](https://github.com/blog/1375-task-lists-in-gfm-issues-pulls-comments) in the PR description. - All other tests pass when everything is rebuilt from scratch. On Unix-like systems, check with (from the toplevel source folder): $ make - When adding additional functionality, provide at least one example script in the ``examples/`` folder. Have a look at other examples for reference. Examples should demonstrate why the new functionality is useful in practice and, if possible, compare it to other methods available in scikit-learn. - Documentation and high-coverage tests are necessary for enhancements to be accepted. - At least one paragraph of narrative documentation with links to references in the literature (with PDF links when possible) and the example. You can also check for common programming errors with the following tools: - Code with good unittest coverage (at least 80%), check with: $ pip install pytest pytest-cov $ pytest --cov=imblearn imblearn - No pyflakes warnings, check with: $ pip install pyflakes $ pyflakes path/to/module.py - No PEP8 warnings, check with: $ pip install pycodestyle $ pycodestyle path/to/module.py - AutoPEP8 can help you fix some of the easy redundant errors: $ pip install autopep8 $ autopep8 path/to/pep8.py Filing bugs ----------- We use Github issues to track all bugs and feature requests; feel free to open an issue if you have found a bug or wish to see a feature implemented. 
It is recommended to check that your issue complies with the following rules before submitting: - Verify that your issue is not being currently addressed by other [issues](https://github.com/scikit-learn-contrib/imbalanced-learn/issues) or [pull requests](https://github.com/scikit-learn-contrib/imbalanced-learn/pulls). - Please ensure all code snippets and error messages are formatted in appropriate code blocks. See [Creating and highlighting code blocks](https://help.github.com/articles/creating-and-highlighting-code-blocks). - Please include your operating system type and version number, as well as your Python, scikit-learn, numpy, and scipy versions. This information can be found by running the following code snippet: ```python import platform; print(platform.platform()) import sys; print("Python", sys.version) import numpy; print("NumPy", numpy.__version__) import scipy; print("SciPy", scipy.__version__) import sklearn; print("Scikit-Learn", sklearn.__version__) import imblearn; print("Imbalanced-Learn", imblearn.__version__) ``` - Please be specific about what estimators and/or functions are involved and the shape of the data, as appropriate; please include a [reproducible](https://stackoverflow.com/help/mcve) code snippet or link to a [gist](https://gist.github.com). If an exception is raised, please provide the traceback. Documentation ------------- We are glad to accept any sort of documentation: function docstrings, reStructuredText documents, tutorials, etc. reStructuredText documents live in the source code repository under the doc/ directory. You can edit the documentation using any text editor and then generate the HTML output by typing ``make html`` from the doc/ directory. Alternatively, ``make`` can be used to quickly generate the documentation without the example gallery. The resulting HTML files will be placed in _build/html/ and are viewable in a web browser. See the README file in the doc/ directory for more information. For building the documentation, you will need [sphinx](http://sphinx-doc.org), [matplotlib](https://matplotlib.org), and [pillow](https://pillow.readthedocs.io). When you are writing documentation, it is important to keep a good compromise between mathematical and algorithmic details, and give intuition to the reader on what the algorithm does. It is best to always start with a small paragraph with a hand-waving explanation of what the method does to the data and a figure (coming from an example) illustrating it. imbalanced-learn-0.7.0/LICENSE000066400000000000000000000022131366766276300157250ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2014 Fernando M. F. Nogueira, Guillaume Lemaitre, Dayvid Victor Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. imbalanced-learn-0.7.0/MANIFEST.in000066400000000000000000000002041366766276300164540ustar00rootroot00000000000000 recursive-include doc * recursive-include examples * include AUTHORS.rst include CONTRIBUTING.md include LICENSE include README.rstimbalanced-learn-0.7.0/Makefile000066400000000000000000000012671366766276300163680ustar00rootroot00000000000000.PHONY: all clean test clean: find . -name "*.so" -o -name "*.pyc" -o -name "*.md5" -o -name "*.pyd" -o -name "*~" | xargs rm -f find . -name "*.pyx" -exec ./tools/rm_pyx_c_file.sh {} \; rm -rf coverage rm -rf dist rm -rf build rm -rf doc/_build rm -rf doc/auto_examples rm -rf doc/generated rm -rf doc/modules rm -rf examples/.ipynb_checkpoints test-code: pytest imblearn test-doc: pytest doc/*.rst test-coverage: rm -rf coverage .coverage pytest --cov=imblearn imblearn test: test-coverage test-doc html: export SPHINXOPTS=-W; make -C doc html conda: conda-build conda-recipe code-analysis: flake8 imblearn | grep -v __init__ pylint -E imblearn/ -d E1103,E0611,E1101 imbalanced-learn-0.7.0/README.rst000066400000000000000000000223511366766276300164140ustar00rootroot00000000000000.. -*- mode: rst -*- .. _scikit-learn: http://scikit-learn.org/stable/ .. _scikit-learn-contrib: https://github.com/scikit-learn-contrib |Azure|_ |Travis|_ |Codecov|_ |CircleCI|_ |PythonVersion|_ |Pypi|_ |Gitter|_ .. |Azure| image:: https://dev.azure.com/imbalanced-learn/imbalanced-learn/_apis/build/status/scikit-learn-contrib.imbalanced-learn?branchName=master .. _Azure: https://dev.azure.com/imbalanced-learn/imbalanced-learn/_build .. |Travis| image:: https://travis-ci.org/scikit-learn-contrib/imbalanced-learn.svg?branch=master .. _Travis: https://travis-ci.org/scikit-learn-contrib/imbalanced-learn .. |Codecov| image:: https://codecov.io/gh/scikit-learn-contrib/imbalanced-learn/branch/master/graph/badge.svg .. _Codecov: https://codecov.io/gh/scikit-learn-contrib/imbalanced-learn .. |CircleCI| image:: https://circleci.com/gh/scikit-learn-contrib/imbalanced-learn.svg?style=shield&circle-token=:circle-token .. _CircleCI: https://circleci.com/gh/scikit-learn-contrib/imbalanced-learn/tree/master .. |PythonVersion| image:: https://img.shields.io/pypi/pyversions/imbalanced-learn.svg .. _PythonVersion: https://img.shields.io/pypi/pyversions/imbalanced-learn.svg .. |Pypi| image:: https://badge.fury.io/py/imbalanced-learn.svg .. _Pypi: https://badge.fury.io/py/imbalanced-learn .. |Gitter| image:: https://badges.gitter.im/scikit-learn-contrib/imbalanced-learn.svg .. _Gitter: https://gitter.im/scikit-learn-contrib/imbalanced-learn?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge imbalanced-learn ================ imbalanced-learn is a Python package offering a number of re-sampling techniques commonly used in datasets showing strong between-class imbalance. It is compatible with scikit-learn_ and is part of scikit-learn-contrib_ projects. Documentation ------------- Installation documentation, API documentation, and examples can be found on the documentation_. .. _documentation: https://imbalanced-learn.org/stable/ Installation ------------ Dependencies ~~~~~~~~~~~~ imbalanced-learn is tested to work under Python 3.6+. 
The dependency requirements are based on the last scikit-learn release: * scipy(>=0.19.1) * numpy(>=1.13.3) * scikit-learn(>=0.22) * joblib(>=0.11) * keras 2 (optional) * tensorflow (optional) Additionally, to run the examples, you need matplotlib(>=2.0.0) and pandas(>=0.22). Installation ~~~~~~~~~~~~ imbalanced-learn is currently available on PyPI and you can install it via `pip`:: pip install -U imbalanced-learn The package is also released on the Anaconda Cloud platform:: conda install -c conda-forge imbalanced-learn If you prefer, you can clone it and run the setup.py file. Use the following commands to get a copy from GitHub and install all dependencies:: git clone https://github.com/scikit-learn-contrib/imbalanced-learn.git cd imbalanced-learn pip install . Or install using pip and GitHub:: pip install -U git+https://github.com/scikit-learn-contrib/imbalanced-learn.git Testing ~~~~~~~ After installation, you can use `pytest` to run the test suite:: make test-coverage Development ----------- The development of this scikit-learn-contrib project is in line with that of the scikit-learn community. Therefore, you can refer to their `Development Guide `_. About ----- If you use imbalanced-learn in a scientific publication, we would appreciate citations to the following paper:: @article{JMLR:v18:16-365, author = {Guillaume Lema{{\^i}}tre and Fernando Nogueira and Christos K. Aridas}, title = {Imbalanced-learn: A Python Toolbox to Tackle the Curse of Imbalanced Datasets in Machine Learning}, journal = {Journal of Machine Learning Research}, year = {2017}, volume = {18}, number = {17}, pages = {1-5}, url = {http://jmlr.org/papers/v18/16-365} } Most classification algorithms will only perform optimally when the number of samples of each class is roughly the same. Highly skewed datasets, where the minority class is heavily outnumbered by one or more classes, have proven to be a challenge while at the same time becoming more and more common. One way of addressing this issue is by re-sampling the dataset so as to offset this imbalance with the hope of arriving at a more robust and fair decision boundary than you would otherwise. Re-sampling techniques are divided into four categories: 1. Under-sampling the majority class(es). 2. Over-sampling the minority class. 3. Combining over- and under-sampling. 4. Creating ensemble balanced sets. Below is a list of the methods currently implemented in this module. * Under-sampling 1. Random majority under-sampling with replacement 2. Extraction of majority-minority Tomek links [1]_ 3. Under-sampling with Cluster Centroids 4. NearMiss-(1 & 2 & 3) [2]_ 5. Condensed Nearest Neighbour [3]_ 6. One-Sided Selection [4]_ 7. Neighbourhood Cleaning Rule [5]_ 8. Edited Nearest Neighbours [6]_ 9. Instance Hardness Threshold [7]_ 10. Repeated Edited Nearest Neighbours [14]_ 11. AllKNN [14]_ * Over-sampling 1. Random minority over-sampling with replacement 2. SMOTE - Synthetic Minority Over-sampling Technique [8]_ 3. SMOTENC - SMOTE for Nominal and Continuous [8]_ 4. bSMOTE(1 & 2) - Borderline SMOTE of types 1 and 2 [9]_ 5. SVM SMOTE - Support Vectors SMOTE [10]_ 6. ADASYN - Adaptive synthetic sampling approach for imbalanced learning [15]_ 7. KMeans-SMOTE [17]_ * Over-sampling followed by under-sampling 1. SMOTE + Tomek links [12]_ 2. SMOTE + ENN [11]_ * Ensemble classifier using samplers internally 1. Easy Ensemble classifier [13]_ 2. Balanced Random Forest [16]_ 3. Balanced Bagging 4. 
RUSBoost [18]_ * Mini-batch resampling for Keras and Tensorflow The different algorithms are presented in the sphinx-gallery_. .. _sphinx-gallery: https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/index.html References: ----------- .. [1] : I. Tomek, “Two modifications of CNN,” IEEE Transactions on Systems, Man, and Cybernetics, vol. 6, pp. 769-772, 1976. .. [2] : I. Mani, J. Zhang. “kNN approach to unbalanced data distributions: A case study involving information extraction,” In Proceedings of the Workshop on Learning from Imbalanced Data Sets, pp. 1-7, 2003. .. [3] : P. E. Hart, “The condensed nearest neighbor rule,” IEEE Transactions on Information Theory, vol. 14(3), pp. 515-516, 1968. .. [4] : M. Kubat, S. Matwin, “Addressing the curse of imbalanced training sets: One-sided selection,” In Proceedings of the 14th International Conference on Machine Learning, vol. 97, pp. 179-186, 1997. .. [5] : J. Laurikkala, “Improving identification of difficult small classes by balancing class distribution,” Proceedings of the 8th Conference on Artificial Intelligence in Medicine in Europe, pp. 63-66, 2001. .. [6] : D. Wilson, “Asymptotic Properties of Nearest Neighbor Rules Using Edited Data,” IEEE Transactions on Systems, Man, and Cybernetics, vol. 2(3), pp. 408-421, 1972. .. [7] : M. R. Smith, T. Martinez, C. Giraud-Carrier, “An instance level analysis of data complexity,” Machine learning, vol. 95(2), pp. 225-256, 2014. .. [8] : N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer, “SMOTE: Synthetic minority over-sampling technique,” Journal of Artificial Intelligence Research, vol. 16, pp. 321-357, 2002. .. [9] : H. Han, W.-Y. Wang, B.-H. Mao, “Borderline-SMOTE: A new over-sampling method in imbalanced data sets learning,” In Proceedings of the 1st International Conference on Intelligent Computing, pp. 878-887, 2005. .. [10] : H. M. Nguyen, E. W. Cooper, K. Kamei, “Borderline over-sampling for imbalanced data classification,” In Proceedings of the 5th International Workshop on computational Intelligence and Applications, pp. 24-29, 2009. .. [11] : G. E. A. P. A. Batista, R. C. Prati, M. C. Monard, “A study of the behavior of several methods for balancing machine learning training data,” ACM Sigkdd Explorations Newsletter, vol. 6(1), pp. 20-29, 2004. .. [12] : G. E. A. P. A. Batista, A. L. C. Bazzan, M. C. Monard, “Balancing training data for automated annotation of keywords: A case study,” In Proceedings of the 2nd Brazilian Workshop on Bioinformatics, pp. 10-18, 2003. .. [13] : X.-Y. Liu, J. Wu and Z.-H. Zhou, “Exploratory undersampling for class-imbalance learning,” IEEE Transactions on Systems, Man, and Cybernetics, vol. 39(2), pp. 539-550, 2009. .. [14] : I. Tomek, “An experiment with the edited nearest-neighbor rule,” IEEE Transactions on Systems, Man, and Cybernetics, vol. 6(6), pp. 448-452, 1976. .. [15] : H. He, Y. Bai, E. A. Garcia, S. Li, “ADASYN: Adaptive synthetic sampling approach for imbalanced learning,” In Proceedings of the 5th IEEE International Joint Conference on Neural Networks, pp. 1322-1328, 2008. .. [16] : C. Chen, A. Liaw, and L. Breiman. "Using random forest to learn imbalanced data." University of California, Berkeley 110 (2004): 1-12. .. [17] : Felix Last, Georgios Douzas, Fernando Bacao, "Oversampling for Imbalanced Learning Based on K-Means and SMOTE" .. [18] : Seiffert, C., Khoshgoftaar, T. M., Van Hulse, J., & Napolitano, A. "RUSBoost: A hybrid approach to alleviating class imbalance." 
IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans 40.1 (2010): 185-197.imbalanced-learn-0.7.0/azure-pipelines.yml000066400000000000000000000056741366766276300205750ustar00rootroot00000000000000# Adapted from https://github.com/scikit-learn/scikit-learn/blob/master/azure-pipelines.yml jobs: - job: linting displayName: Linting pool: vmImage: ubuntu-18.04 steps: - bash: echo "##vso[task.prependpath]$CONDA/bin" displayName: Add conda to PATH - bash: sudo chown -R $USER $CONDA displayName: Take ownership of conda installation - bash: conda create --name flake8_env --yes flake8 displayName: Install flake8 - bash: | source activate flake8_env ./build_tools/circle/linting.sh displayName: Run linting - template: build_tools/azure/posix.yml parameters: name: Linux vmImage: ubuntu-18.04 dependsOn: [linting] matrix: # Linux environment to test that scikit-learn can be built against # versions of numpy, scipy with ATLAS that comes with Ubuntu Bionic 18.04 # i.e. numpy 1.13.3 and scipy 0.19 py36_ubuntu_atlas: DISTRIB: 'ubuntu' PYTHON_VERSION: '3.6' JOBLIB_VERSION: '*' # Linux environment to test the latest available dependencies and MKL. pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' PYTHON_VERSION: '3.8' COVERAGE: 'true' PANDAS_VERSION: '*' TEST_DOCSTRINGS: 'true' JOBLIB_VERSION: '*' CHECK_WARNINGS: 'true' pylatest_conda_pandas_keras: DISTRIB: 'conda' PYTHON_VERSION: '3.7' INSTALL_MKL: 'true' PANDAS_VERSION: '*' KERAS_VERSION: '*' COVERAGE: 'true' JOBLIB_VERSION: '*' TEST_DOCSTRINGS: 'true' pylatest_conda_pandas_tensorflow: DISTRIB: 'conda' PYTHON_VERSION: '3.8' PANDAS_VERSION: '*' JOBLIB_VERSION: '*' INSTALL_MKL: 'true' TENSORFLOW_VERSION: '*' COVERAGE: 'true' TEST_DOCSTRINGS: 'true' - template: build_tools/azure/posix-32.yml parameters: name: Linux32 vmImage: ubuntu-18.04 dependsOn: [linting] matrix: py36_ubuntu_atlas_32bit: DISTRIB: 'ubuntu-32' PYTHON_VERSION: '3.6' JOBLIB_VERSION: '*' TEST_DOCSTRINGS: 'true' - template: build_tools/azure/posix.yml parameters: name: macOS vmImage: macOS-10.14 dependsOn: [linting] matrix: pylatest_conda_mkl: DISTRIB: 'conda' PYTHON_VERSION: '*' INSTALL_MKL: 'true' NUMPY_VERSION: '*' SCIPY_VERSION: '*' PANDAS_VERSION: '*' PYTEST_VERSION: '*' JOBLIB_VERSION: '*' COVERAGE: 'true' TEST_DOCSTRINGS: 'true' CHECK_WARNINGS: 'true' - template: build_tools/azure/windows.yml parameters: name: Windows vmImage: vs2017-win2016 dependsOn: [linting] matrix: py37_conda_mkl: PYTHON_VERSION: '3.8' PYTHON_ARCH: '64' PYTEST_VERSION: '*' COVERAGE: 'true' CHECK_WARNINGS: 'true' py36_pip_openblas_32bit: PYTHON_VERSION: '3.6' PYTHON_ARCH: '32' imbalanced-learn-0.7.0/build_tools/000077500000000000000000000000001366766276300172415ustar00rootroot00000000000000imbalanced-learn-0.7.0/build_tools/azure/000077500000000000000000000000001366766276300203675ustar00rootroot00000000000000imbalanced-learn-0.7.0/build_tools/azure/install.cmd000066400000000000000000000023351366766276300225250ustar00rootroot00000000000000@rem https://github.com/numba/numba/blob/master/buildscripts/incremental/setup_conda_environment.cmd @rem The cmd /C hack circumvents a regression where conda installs a conda.bat @rem script in non-root environments. 
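@rem The two wrapper variables defined just below keep the build non-interactive:
@rem -q silences progress output and -y auto-confirms conda prompts.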
set CONDA_INSTALL=cmd /C conda install -q -y set PIP_INSTALL=pip install -q @echo on IF "%PYTHON_ARCH%"=="64" ( @rem Deactivate any environment call deactivate @rem Clean up any left-over from a previous build conda remove --all -q -y -n %VIRTUALENV% conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython wheel joblib git call activate %VIRTUALENV% IF "%PYTEST_VERSION%"=="*" ( pip install pytest ) else ( pip install pytest==%PYTEST_VERSION% ) pip install pytest-xdist ) else ( pip install numpy scipy cython pytest wheel pillow joblib ) if "%COVERAGE%" == "true" ( pip install coverage codecov pytest-cov ) python --version pip --version pip install scikit-learn @rem Install the build and runtime dependencies of the project. python setup.py bdist_wheel bdist_wininst @rem Install the generated wheel package to test it pip install --pre --no-index --find-links dist\ imbalanced-learn if %errorlevel% neq 0 exit /b %errorlevel% imbalanced-learn-0.7.0/build_tools/azure/install.sh000077500000000000000000000075301366766276300224010ustar00rootroot00000000000000#!/bin/bash set -e set -x UNAMESTR=`uname` make_conda() { TO_INSTALL="$@" conda create -n $VIRTUALENV --yes $TO_INSTALL source activate $VIRTUALENV } version_ge() { # The two version numbers, separated with a new line, are piped to sort # -rV. The -V activates version number sorting and -r sorts in # descending order. If the first argument is the top element of the sort, it # is greater than or equal to the second argument. test "$(printf "${1}\n${2}" | sort -rV | head -n 1)" == "$1" } if [[ "$DISTRIB" == "conda" ]]; then TO_INSTALL="python=$PYTHON_VERSION pip \ numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ joblib=$JOBLIB_VERSION git" if [[ "$INSTALL_MKL" == "true" ]]; then TO_INSTALL="$TO_INSTALL mkl" else TO_INSTALL="$TO_INSTALL nomkl" fi make_conda $TO_INSTALL python -m pip install scikit-learn TO_INSTALL="" if [[ -n "$PANDAS_VERSION" ]]; then TO_INSTALL="$TO_INSTALL pandas=$PANDAS_VERSION" fi if [[ -n "$KERAS_VERSION" ]]; then TO_INSTALL="$TO_INSTALL keras=$KERAS_VERSION tensorflow=1" KERAS_BACKEND=tensorflow fi if [[ -n "$TENSORFLOW_VERSION" ]]; then TO_INSTALL="$TO_INSTALL tensorflow=$TENSORFLOW_VERSION" fi if [[ "$PYTEST_VERSION" == "*" ]]; then python -m pip install pytest else python -m pip install pytest=="$PYTEST_VERSION" fi if [[ "$PYTHON_VERSION" == "*" ]]; then python -m pip install pytest-xdist fi if [[ -n "$TO_INSTALL" ]]; then conda install --yes $TO_INSTALL fi if [[ -n "$KERAS_VERSION" ]]; then python -c "import keras.backend" sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' ~/.keras/keras.json; fi elif [[ "$DISTRIB" == "ubuntu" ]]; then sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test sudo apt-get update sudo apt-get install python3-scipy libatlas3-base libatlas-base-dev python3-virtualenv git python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV source $VIRTUALENV/bin/activate python -m pip install pytest==$PYTEST_VERSION pytest-cov joblib cython python -m pip install scikit-learn elif [[ "$DISTRIB" == "ubuntu-32" ]]; then apt-get update apt-get install -y python3-dev python3-scipy libatlas3-base libatlas-base-dev python3-virtualenv git python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV source $VIRTUALENV/bin/activate python -m pip install pytest==$PYTEST_VERSION pytest-cov joblib cython python -m pip install scikit-learn elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then # Since the conda main channel usually lags behind on
the latest releases, # we use PyPI to test against the latest releases of the dependencies. # conda is still used as a convenient way to install Python and pip. make_conda "python=$PYTHON_VERSION" python -m pip install -U pip python -m pip install numpy scipy joblib cython python -m pip install scikit-learn python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist python -m pip install pandas fi if [[ "$COVERAGE" == "true" ]]; then python -m pip install coverage codecov pytest-cov fi if [[ "$TEST_DOCSTRINGS" == "true" ]]; then python -m pip install sphinx python -m pip install -U git+https://github.com/numpy/numpydoc.git fi python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" python -c "\ try: import pandas print('pandas %s' % pandas.__version__) except ImportError: print('pandas not installed') " python -m pip list # Use setup.py instead of `pip install -e .` to be able to pass the -j flag # to speed up the build on multicore CI machines. python setup.py develop imbalanced-learn-0.7.0/build_tools/azure/posix-32.yml000066400000000000000000000035261366766276300225040ustar00rootroot00000000000000parameters: name: '' vmImage: '' matrix: [] dependsOn: [] jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} pool: vmImage: ${{ parameters.vmImage }} variables: TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' JUNITXML: 'test-data.xml' OMP_NUM_THREADS: '4' PYTEST_VERSION: '5.2.1' OPENBLAS_NUM_THREADS: '4' SKLEARN_SKIP_NETWORK_TESTS: '1' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} steps: # Container is detached and sleeping, allowing steps to run commands # in the container. The TEST_DIR is mapped allowing the host to access # the JUNITXML file - script: > docker container run --rm --volume $TEST_DIR:/temp_dir --volume $PWD:/io -w /io --detach --name skcontainer -e DISTRIB=ubuntu-32 -e TEST_DIR=/temp_dir -e JUNITXML=$JUNITXML -e VIRTUALENV=testvenv -e JOBLIB_VERSION=$JOBLIB_VERSION -e PYTEST_VERSION=$PYTEST_VERSION -e OMP_NUM_THREADS=$OMP_NUM_THREADS -e OPENBLAS_NUM_THREADS=$OPENBLAS_NUM_THREADS -e SKLEARN_SKIP_NETWORK_TESTS=$SKLEARN_SKIP_NETWORK_TESTS i386/ubuntu:18.04 sleep 1000000 displayName: 'Start container' - script: > docker exec skcontainer ./build_tools/azure/install.sh displayName: 'Install' - script: > docker exec skcontainer ./build_tools/azure/test_script.sh displayName: 'Test Library' - task: PublishTestResults@2 inputs: testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: succeededOrFailed() - script: > docker container stop skcontainer displayName: 'Stop container' condition: always() imbalanced-learn-0.7.0/build_tools/azure/posix.yml000066400000000000000000000027431366766276300222600ustar00rootroot00000000000000parameters: name: '' vmImage: '' matrix: [] dependsOn: [] jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} pool: vmImage: ${{ parameters.vmImage }} variables: TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' VIRTUALENV: 'testvenv' JUNITXML: 'test-data.xml' PYTEST_VERSION: '5.2.1' OMP_NUM_THREADS: '4' OPENBLAS_NUM_THREADS: '4' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} steps: - bash: echo "##vso[task.prependpath]$CONDA/bin" displayName: Add conda to PATH condition: startsWith(variables['DISTRIB'], 'conda') - bash: sudo chown -R $USER $CONDA displayName: Take ownership of conda installation condition: 
startsWith(variables['DISTRIB'], 'conda') - script: | build_tools/azure/install.sh displayName: 'Install' - script: | build_tools/azure/test_script.sh displayName: 'Test Library' - script: | build_tools/azure/test_docs.sh displayName: 'Test Docs' - task: PublishTestResults@2 inputs: testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: succeededOrFailed() - script: | build_tools/azure/upload_codecov.sh condition: and(succeeded(), eq(variables['COVERAGE'], 'true')) displayName: 'Upload To Codecov' env: CODECOV_TOKEN: $(CODECOV_TOKEN) imbalanced-learn-0.7.0/build_tools/azure/test_docs.sh000077500000000000000000000004321366766276300227140ustar00rootroot00000000000000#!/bin/bash set -e if [[ "$DISTRIB" =~ ^conda.* ]]; then source activate $VIRTUALENV elif [[ "$DISTRIB" == "ubuntu" ]]; then source $VIRTUALENV/bin/activate fi if [[ "$TEST_DOCSTRINGS" == 'true' ]]; then make test-doc pytest -vsl maint_tools/test_docstring.py fi imbalanced-learn-0.7.0/build_tools/azure/test_script.cmd000066400000000000000000000007411366766276300234210ustar00rootroot00000000000000@echo on @rem Only 64 bit uses conda and uses a python newer than 3.5 IF "%PYTHON_ARCH%"=="64" ( call activate %VIRTUALENV% set PYTEST_ARGS=%PYTEST_ARGS% -n2 ) mkdir %TMP_FOLDER% cd %TMP_FOLDER% if "%CHECK_WARNINGS%" == "true" ( set PYTEST_ARGS=%PYTEST_ARGS% -Werror::DeprecationWarning -Werror::FutureWarning ) if "%COVERAGE%" == "true" ( set PYTEST_ARGS=%PYTEST_ARGS% --cov imblearn ) pytest --junitxml=%JUNITXML% --showlocals --durations=20 %PYTEST_ARGS% --pyargs imblearn imbalanced-learn-0.7.0/build_tools/azure/test_script.sh000077500000000000000000000021501366766276300232670ustar00rootroot00000000000000#!/bin/bash set -e if [[ "$DISTRIB" =~ ^conda.* ]]; then source activate $VIRTUALENV elif [[ "$DISTRIB" == "ubuntu" ]] || [[ "$DISTRIB" == "ubuntu-32" ]]; then source $VIRTUALENV/bin/activate fi python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" python -c "\ try: import pandas print('pandas %s' % pandas.__version__) except ImportError: print('pandas not installed') " python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" pip list TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" if [[ "$COVERAGE" == "true" ]]; then export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" TEST_CMD="$TEST_CMD --cov-config=$COVERAGE_PROCESS_START --cov imblearn" fi if [[ -n "$CHECK_WARNINGS" ]]; then TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning" fi if [[ "$PYTHON_VERSION" == "*" ]]; then TEST_CMD="$TEST_CMD -n2" fi mkdir -p $TEST_DIR cp setup.cfg $TEST_DIR cd $TEST_DIR set -x $TEST_CMD --pyargs imblearn set +x imbalanced-learn-0.7.0/build_tools/azure/upload_codecov.cmd000066400000000000000000000003301366766276300240360ustar00rootroot00000000000000@echo on @rem Only 64 bit uses conda IF "%PYTHON_ARCH%"=="64" ( call activate %VIRTUALENV% ) copy %TMP_FOLDER%\.coverage %BUILD_REPOSITORY_LOCALPATH% codecov --root %BUILD_REPOSITORY_LOCALPATH% -t %CODECOV_TOKEN% imbalanced-learn-0.7.0/build_tools/azure/upload_codecov.sh000077500000000000000000000006671366766276300237250ustar00rootroot00000000000000#!/bin/bash set -e # called when COVERAGE=="true" and DISTRIB=="conda" export PATH=$HOME/miniconda3/bin:$PATH source activate $VIRTUALENV # Need to run codecov from a git checkout, so we copy 
.coverage # from TEST_DIR where pytest has been run pushd $TEST_DIR coverage combine --append popd cp $TEST_DIR/.coverage $BUILD_REPOSITORY_LOCALPATH codecov --root $BUILD_REPOSITORY_LOCALPATH -t $CODECOV_TOKEN || echo "codecov upload failed" imbalanced-learn-0.7.0/build_tools/azure/windows.yml000066400000000000000000000027671366766276300226200ustar00rootroot00000000000000 parameters: name: '' vmImage: '' matrix: [] dependsOn: [] jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} pool: vmImage: ${{ parameters.vmImage }} variables: VIRTUALENV: 'testvenv' JUNITXML: 'test-data.xml' SKLEARN_SKIP_NETWORK_TESTS: '1' PYTEST_VERSION: '5.2.1' TMP_FOLDER: '$(Agent.WorkFolder)\tmp_folder' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} steps: - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" displayName: Add conda to PATH for 64 bit Python condition: eq(variables['PYTHON_ARCH'], '64') - task: UsePythonVersion@0 inputs: versionSpec: '$(PYTHON_VERSION)' addToPath: true architecture: 'x86' displayName: Use 32 bit System Python condition: eq(variables['PYTHON_ARCH'], '32') - script: | build_tools\\azure\\install.cmd displayName: 'Install' - script: | build_tools\\azure\\test_script.cmd displayName: 'Test Library' - script: | build_tools\\azure\\upload_codecov.cmd condition: and(succeeded(), eq(variables['COVERAGE'], 'true')) displayName: 'Upload To Codecov' env: CODECOV_TOKEN: $(CODECOV_TOKEN) - task: PublishTestResults@2 inputs: testResultsFiles: '$(TMP_FOLDER)\$(JUNITXML)' testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: succeededOrFailed() imbalanced-learn-0.7.0/build_tools/circle/000077500000000000000000000000001366766276300205025ustar00rootroot00000000000000imbalanced-learn-0.7.0/build_tools/circle/build_doc.sh000077500000000000000000000071561366766276300227760ustar00rootroot00000000000000#!/usr/bin/env bash set -x set -e # Decide what kind of documentation build to run, and run it. # # If the last commit message has a "[doc skip]" marker, do not build # the doc. On the contrary if a "[doc build]" marker is found, build the doc # instead of relying on the subsequent rules. # # We always build the documentation for jobs that are not related to a specific # PR (e.g. a merge to master or a maintenance branch). # # If this is a PR, do a full build if there are some files in this PR that are # under the "doc/" or "examples/" folders, otherwise perform a quick build. # # If the inspection of the current commit fails for any reason, the default # behavior is to quick build the documentation. 
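# get_build_type echoes a one-line decision (SKIP/QUICK BUILD/BUILD plus a
# reason for the CI log); the caller below only pattern-matches on the leading
# ^SKIP and otherwise proceeds with a full HTML build.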
get_build_type() { if [ -z "$CIRCLE_SHA1" ] then echo SKIP: undefined CIRCLE_SHA1 return fi commit_msg=$(git log --format=%B -n 1 $CIRCLE_SHA1) if [ -z "$commit_msg" ] then echo QUICK BUILD: failed to inspect commit $CIRCLE_SHA1 return fi if [[ "$commit_msg" =~ \[doc\ skip\] ]] then echo SKIP: [doc skip] marker found return fi if [[ "$commit_msg" =~ \[doc\ quick\] ]] then echo QUICK: [doc quick] marker found return fi if [[ "$commit_msg" =~ \[doc\ build\] ]] then echo BUILD: [doc build] marker found return fi if [ -z "$CI_PULL_REQUEST" ] then echo BUILD: not a pull request return fi git_range="origin/master...$CIRCLE_SHA1" git fetch origin master >&2 || (echo QUICK BUILD: failed to get changed filenames for $git_range; return) filenames=$(git diff --name-only $git_range) if [ -z "$filenames" ] then echo QUICK BUILD: no changed filenames for $git_range return fi if echo "$filenames" | grep -q -e ^examples/ then echo BUILD: detected examples/ filename modified in $git_range: $(echo "$filenames" | grep -e ^examples/ | head -n1) return fi echo QUICK BUILD: no examples/ filename modified in $git_range: echo "$filenames" } build_type=$(get_build_type) if [[ "$build_type" =~ ^SKIP ]] then exit 0 fi MAKE_TARGET=html # Installing required system packages to support the rendering of math # notation in the HTML documentation sudo -E apt-get -yq update sudo -E apt-get -yq remove texlive-binaries --purge sudo -E apt-get -yq --no-install-suggests --no-install-recommends \ install dvipng texlive-latex-base texlive-latex-extra \ texlive-latex-recommended texlive-fonts-recommended \ latexmk gsfonts ccache # deactivate circleci virtualenv and setup a miniconda env instead if [[ `type -t deactivate` ]]; then deactivate fi # Install dependencies with miniconda wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ -O miniconda.sh chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH export PATH="$MINICONDA_PATH/bin:$PATH" conda update --yes --quiet conda # Configure the conda environment and put it in the path using the # provided versions conda create -n $CONDA_ENV_NAME --yes --quiet python=3.7 source activate $CONDA_ENV_NAME conda install --yes pip numpy scipy joblib pillow matplotlib memory_profiler \ sphinx sphinx_rtd_theme \pandas keras tensorflow=1 pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git pip install -U git+https://github.com/numpy/numpydoc.git pip install -U git+https://github.com/mcmtroffaes/sphinxcontrib-bibtex.git # Build and install imbalanced-learn in dev mode ls -l pip install -e . # The pipefail is requested to propagate exit code set -o pipefail && cd doc && make $MAKE_TARGET 2>&1 | tee ~/log.txt cd - set +o pipefail imbalanced-learn-0.7.0/build_tools/circle/checkout_merge_commit.sh000077500000000000000000000016411366766276300253770ustar00rootroot00000000000000#!/bin/bash # Add `master` branch to the update list. # Otherwise CircleCI will give us a cached one. FETCH_REFS="+master:master" # Update PR refs for testing. if [[ -n "${CIRCLE_PR_NUMBER}" ]] then FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/head:pr/${CIRCLE_PR_NUMBER}/head" FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" fi # Retrieve the refs. git fetch -u origin ${FETCH_REFS} # Checkout the PR merge ref. if [[ -n "${CIRCLE_PR_NUMBER}" ]] then git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge" || ( echo Could not fetch merge commit. 
>&2 echo There may be conflicts in merging PR \#${CIRCLE_PR_NUMBER} with master. >&2; exit 1) fi # Check for merge conflicts. if [[ -n "${CIRCLE_PR_NUMBER}" ]] then git branch --merged | grep master > /dev/null git branch --merged | grep "pr/${CIRCLE_PR_NUMBER}/head" > /dev/null fi imbalanced-learn-0.7.0/build_tools/circle/linting.sh000077500000000000000000000142441366766276300225120ustar00rootroot00000000000000#!/bin/bash # This script is used in CircleCI to check that PRs do not add obvious # flake8 violations. It relies on two things: # - find common ancestor between branch and # scikit-learn-contrib/imbalanced-learn remote # - run flake8 --diff on the diff between the branch and the common # ancestor # # Additional features: # - the line numbers in Travis match the local branch on the PR # author machine. # - ./build_tools/circle/linting.sh can be run locally for quick # turn-around set -e # pipefail is necessary to propagate exit codes set -o pipefail PROJECT=scikit-learn-contrib/imbalanced-learn PROJECT_URL=https://github.com/$PROJECT.git # Find the remote with the project name (upstream in most cases) REMOTE=$(git remote -v | grep $PROJECT | cut -f1 | head -1 || echo '') # Add a temporary remote if needed. For example this is necessary when # Travis is configured to run in a fork. In this case 'origin' is the # fork and not the reference repo we want to diff against. if [[ -z "$REMOTE" ]]; then TMP_REMOTE=tmp_reference_upstream REMOTE=$TMP_REMOTE git remote add $REMOTE $PROJECT_URL fi echo "Remotes:" echo '--------------------------------------------------------------------------------' git remote --verbose # Travis does the git clone with a limited depth (50 at the time of # writing). This may not be enough to find the common ancestor with # $REMOTE/master so we unshallow the git checkout if [[ -a .git/shallow ]]; then echo -e '\nTrying to unshallow the repo:' echo '--------------------------------------------------------------------------------' git fetch --unshallow fi if [[ "$TRAVIS" == "true" ]]; then if [[ "$TRAVIS_PULL_REQUEST" == "false" ]] then # In main repo, using TRAVIS_COMMIT_RANGE to test the commits # that were pushed into a branch if [[ "$PROJECT" == "$TRAVIS_REPO_SLUG" ]]; then if [[ -z "$TRAVIS_COMMIT_RANGE" ]]; then echo "New branch, no commit range from Travis so passing this test by convention" exit 0 fi COMMIT_RANGE=$TRAVIS_COMMIT_RANGE fi else # We want to fetch the code as it is in the PR branch and not # the result of the merge into master. This way line numbers # reported by Travis will match with the local code. 
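# e.g. for pull request 123 this fetches the PR head into the local ref
# refs/travis_pr_123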
LOCAL_BRANCH_REF=travis_pr_$TRAVIS_PULL_REQUEST # In Travis the PR target is always origin git fetch origin pull/$TRAVIS_PULL_REQUEST/head:refs/$LOCAL_BRANCH_REF fi fi # If not using the commit range from Travis we need to find the common # ancestor between $LOCAL_BRANCH_REF and $REMOTE/master if [[ -z "$COMMIT_RANGE" ]]; then if [[ -z "$LOCAL_BRANCH_REF" ]]; then LOCAL_BRANCH_REF=$(git rev-parse --abbrev-ref HEAD) fi echo -e "\nLast 2 commits in $LOCAL_BRANCH_REF:" echo '--------------------------------------------------------------------------------' git --no-pager log -2 $LOCAL_BRANCH_REF REMOTE_MASTER_REF="$REMOTE/master" # Make sure that $REMOTE_MASTER_REF is a valid reference echo -e "\nFetching $REMOTE_MASTER_REF" echo '--------------------------------------------------------------------------------' git fetch $REMOTE master:refs/remotes/$REMOTE_MASTER_REF LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short $LOCAL_BRANCH_REF) REMOTE_MASTER_SHORT_HASH=$(git rev-parse --short $REMOTE_MASTER_REF) COMMIT=$(git merge-base $LOCAL_BRANCH_REF $REMOTE_MASTER_REF) || \ echo "No common ancestor found for $(git show $LOCAL_BRANCH_REF -q) and $(git show $REMOTE_MASTER_REF -q)" if [ -z "$COMMIT" ]; then exit 1 fi COMMIT_SHORT_HASH=$(git rev-parse --short $COMMIT) echo -e "\nCommon ancestor between $LOCAL_BRANCH_REF ($LOCAL_BRANCH_SHORT_HASH)"\ "and $REMOTE_MASTER_REF ($REMOTE_MASTER_SHORT_HASH) is $COMMIT_SHORT_HASH:" echo '--------------------------------------------------------------------------------' git --no-pager show --no-patch $COMMIT_SHORT_HASH COMMIT_RANGE="$COMMIT_SHORT_HASH..$LOCAL_BRANCH_SHORT_HASH" if [[ -n "$TMP_REMOTE" ]]; then git remote remove $TMP_REMOTE fi else echo "Got the commit range from Travis: $COMMIT_RANGE" fi echo -e '\nRunning flake8 on the diff in the range' "$COMMIT_RANGE" \ "($(git rev-list $COMMIT_RANGE | wc -l) commit(s)):" echo '--------------------------------------------------------------------------------' # We ignore files from sklearn/externals. Unfortunately there is no # way to do it with flake8 directly (the --exclude does not seem to # work with --diff). We could use the exclude magic in the git pathspec # ':!sklearn/externals' but it is only available on git 1.9 and Travis # uses git 1.8. # We need the following command to exit with 0 hence the echo in case # there is no match MODIFIED_FILES="$(git diff --name-only $COMMIT_RANGE | grep -v 'sklearn/externals' | \ grep -v 'doc/sphinxext' || echo "no_match")" check_files() { files="$1" shift options="$*" if [ -n "$files" ]; then # Conservative approach: diff without context (--unified=0) so that code # that was not changed does not create failures git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --diff --show-source $options fi } if [[ "$MODIFIED_FILES" == "no_match" ]]; then echo "No file outside sklearn/externals and doc/sphinxext has been modified" else check_files "$(echo "$MODIFIED_FILES" | grep -v ^examples)" check_files "$(echo "$MODIFIED_FILES" | grep ^examples)" \ --config ./examples/.flake8 fi echo -e "No problem detected by flake8\n" # For docstrings and warnings of deprecated attributes to be rendered # properly, the property decorator must come before the deprecated decorator # (else they are treated as functions) # do not error when grep -B1 "@property" finds nothing set +e bad_deprecation_property_order=`git grep -A 10 "@property" -- "*.py" | awk '/@property/,/def /' | grep -B1 "@deprecated"` if [ ! 
-z "$bad_deprecation_property_order" ] then echo "property decorator should come before deprecated decorator" echo "found the following occurrencies:" echo $bad_deprecation_property_order exit 1 fi imbalanced-learn-0.7.0/build_tools/circle/push_doc.sh000077500000000000000000000024621366766276300226510ustar00rootroot00000000000000#!/bin/bash # This script is meant to be called in the "deploy" step defined in # circle.yml. See https://circleci.com/docs/ for more details. # The behavior of the script is controlled by environment variable defined # in the circle.yml in the top level folder of the project. GENERATED_DOC_DIR=$1 if [[ -z "$GENERATED_DOC_DIR" ]]; then echo "Need to pass directory of the generated doc as argument" echo "Usage: $0 " exit 1 fi # Absolute path needed because we use cd further down in this script GENERATED_DOC_DIR=$(readlink -f $GENERATED_DOC_DIR) if [ "$CIRCLE_BRANCH" = "master" ] then dir=dev else # Strip off .X dir="${CIRCLE_BRANCH::-2}" fi MSG="Pushing the docs to $dir/ for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1" cd $HOME if [ ! -d $DOC_REPO ]; then git clone --depth 1 --no-checkout -b master "git@github.com:"$ORGANIZATION"/"$DOC_REPO".git"; fi cd $DOC_REPO git config core.sparseCheckout true echo $dir > .git/info/sparse-checkout git checkout master git reset --hard origin/master git rm -rf $dir/ && rm -rf $dir/ cp -R $GENERATED_DOC_DIR $dir touch $dir/.nojekyll git config --global user.email $EMAIL git config --global user.name $USERNAME git config --global push.default matching git add -f $dir/ git commit -m "$MSG" $dir git push origin master echo $MSG imbalanced-learn-0.7.0/build_tools/travis/000077500000000000000000000000001366766276300205515ustar00rootroot00000000000000imbalanced-learn-0.7.0/build_tools/travis/after_success.sh000077500000000000000000000010461366766276300237420ustar00rootroot00000000000000#!/bin/bash # This script is meant to be called by the "after_success" step defined in # .travis.yml. See http://docs.travis-ci.com/ for more details. # License: 3-clause BSD set -e # Need to run covdecov from a git checkout, so we copy .coverage # from TEST_DIR where pytest has been run cd $TRAVIS_BUILD_DIR # Ignore covdecov failures as the covdecov server is not # very reliable but we don't want travis to report a failure # in the github UI just because the coverage report failed to # be published. codecov || echo "Covdecov upload failed" imbalanced-learn-0.7.0/build_tools/travis/install.sh000077500000000000000000000036461366766276300225670ustar00rootroot00000000000000#!/bin/bash # This script is meant to be called by the "install" step defined in # .travis.yml. See http://docs.travis-ci.com/ for more details. # The behavior of the script is controlled by environment variabled defined # in the .travis.yml in the top level folder of the project. # License: 3-clause BSD # Travis clone pydicom/pydicom repository in to a local repository. 
set -e export CC=/usr/lib/ccache/gcc export CXX=/usr/lib/ccache/g++ # Useful for debugging how ccache is used # export CCACHE_LOGFILE=/tmp/ccache.log # ~60M is used by .ccache when compiling from scratch at the time of writing ccache --max-size 100M --show-stats # Deactivate the travis-provided virtual environment and set up a # conda-based environment instead deactivate # Install miniconda wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ -O miniconda.sh MINICONDA_PATH=/home/travis/miniconda chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH export PATH=$MINICONDA_PATH/bin:$PATH # Configure the conda environment and put it in the path using the # provided versions conda create -n testenv --yes python=$PYTHON_VERSION pip source activate testenv pip install --upgrade pip setuptools echo "Installing numpy, scipy, and pandas master wheels" dev_url=https://pypi.anaconda.org/scipy-wheels-nightly/simple pip install --pre --upgrade --timeout=60 -f $dev_url numpy scipy pandas echo "Installing joblib master" pip install https://github.com/joblib/joblib/archive/master.zip echo "Installing scikit-learn master" pip install --pre --extra-index $dev_url scikit-learn conda install --yes pytest pytest-cov pip install codecov pip install -U git+https://github.com/numpy/numpydoc.git python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" pip install -e . ccache --show-stats # Useful for debugging how ccache is used # cat $CCACHE_LOGFILE imbalanced-learn-0.7.0/build_tools/travis/test_script.sh000077500000000000000000000023621366766276300234560ustar00rootroot00000000000000#!/bin/bash # This script is meant to be called by the "script" step defined in # .travis.yml. See http://docs.travis-ci.com/ for more details. # The behavior of the script is controlled by environment variables defined # in the .travis.yml in the top level folder of the project. # License: 3-clause BSD set -e run_tests(){ # Get into a temp directory to run the tests from the installed package and # check that we do not leave artifacts mkdir -p $TEST_DIR # We need the setup.cfg for the pytest settings cp setup.cfg $TEST_DIR cd $TEST_DIR python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" pytest --cov=$MODULE -r sx --pyargs $MODULE # Test doc cd $OLDPWD if [[ "$TEST_DOC" == "true" ]]; then make test-doc fi # Validate numpydoc style if [[ "$TEST_NUMPYDOC" == "true" ]]; then pytest -vsl maint_tools/test_docstring.py fi } if [[ "$SKIP_TESTS" != "true" ]]; then run_tests fi # Is directory still empty ? ls -ltra $TEST_DIR ls -ltra $TRAVIS_BUILD_DIR cp $TEST_DIR/.coverage $TRAVIS_BUILD_DIR imbalanced-learn-0.7.0/conftest.py000066400000000000000000000021621366766276300171220ustar00rootroot00000000000000# This file is here so that when running from the root folder # ./imblearn is added to sys.path by pytest. # See https://docs.pytest.org/en/latest/pythonpath.html for more details. # For example, this allows building extensions in place and running pytest # on the doc/*.rst files, using imblearn from the local folder # rather than the one from site-packages. 
# Set numpy array str/repr to legacy behaviour on numpy > 1.13 to make # the doctests pass import os import pytest import numpy as np try: np.set_printoptions(legacy='1.13') except TypeError: pass def pytest_runtest_setup(item): fname = item.fspath.strpath if (fname.endswith(os.path.join('keras', '_generator.py')) or fname.endswith('miscellaneous.rst')): try: import keras except ImportError: pytest.skip('The keras package is not installed.') elif (fname.endswith(os.path.join('tensorflow', '_generator.py')) or fname.endswith('miscellaneous.rst')): try: import tensorflow except ImportError: pytest.skip('The tensorflow package is not installed.') imbalanced-learn-0.7.0/doc/000077500000000000000000000000001366766276300154675ustar00rootroot00000000000000imbalanced-learn-0.7.0/doc/Makefile000066400000000000000000000156371366766276300171430ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = _build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  xml        to make Docutils-native XML files"
	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
clean:
	-rm -rf $(BUILDDIR)/*
	-rm -rf auto_examples/
	-rm -rf generated/*
	-rm -rf modules/generated/*
html:
	# These two lines make the build a bit more lengthy, and
	# the embedding of images more robust
	rm -rf $(BUILDDIR)/html/_images
	#rm -rf _build/doctrees/
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	touch $(BUILDDIR)/html/.nojekyll
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."
json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."
htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/imbalanced-learn.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/imbalanced-learn.qhc"
devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/imbalanced-learn"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/imbalanced-learn"
	@echo "# devhelp"
epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." imbalanced-learn-0.7.0/doc/_static/000077500000000000000000000000001366766276300171155ustar00rootroot00000000000000imbalanced-learn-0.7.0/doc/_static/css/000077500000000000000000000000001366766276300177055ustar00rootroot00000000000000imbalanced-learn-0.7.0/doc/_static/css/imbalanced-learn.css000066400000000000000000000005101366766276300235710ustar00rootroot00000000000000@import url("theme.css"); .highlight a { text-decoration: underline; } .deprecated p { padding: 10px 7px 10px 10px; color: #b94a48; background-color: #F3E5E5; border: 1px solid #eed3d7; } .deprecated p span.versionmodified { font-weight: bold; } .wy-nav-content { max-width: 1200px !important; } imbalanced-learn-0.7.0/doc/_static/js/000077500000000000000000000000001366766276300175315ustar00rootroot00000000000000imbalanced-learn-0.7.0/doc/_static/js/copybutton.js000066400000000000000000000053631366766276300223040ustar00rootroot00000000000000$(document).ready(function() { /* Add a [>>>] button on the top-right corner of code samples to hide * the >>> and ... prompts and the output and thus make the code * copyable. 
*/
    var div = $('.highlight-python .highlight,' +
                '.highlight-python3 .highlight,' +
                '.highlight-pycon .highlight,' +
                '.highlight-default .highlight')
    var pre = div.find('pre');

    // get the styles from the current theme
    pre.parent().parent().css('position', 'relative');
    var hide_text = 'Hide the prompts and output';
    var show_text = 'Show the prompts and output';
    var border_width = pre.css('border-top-width');
    var border_style = pre.css('border-top-style');
    var border_color = pre.css('border-top-color');
    var button_styles = {
        'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0',
        'border-color': border_color, 'border-style': border_style,
        'border-width': border_width, 'color': border_color, 'text-size': '75%',
        'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '0.2em',
        'border-radius': '0 3px 0 0'
    }

    // create and add the button to all the code blocks that contain >>>
    div.each(function(index) {
        var jthis = $(this);
        if (jthis.find('.gp').length > 0) {
            var button = $('<span class="copybutton">&gt;&gt;&gt;</span>');
            button.css(button_styles)
            button.attr('title', hide_text);
            button.data('hidden', 'false');
            jthis.prepend(button);
        }
        // tracebacks (.gt) contain bare text elements that need to be
        // wrapped in a span to work with .nextUntil() (see later)
        jthis.find('pre:has(.gt)').contents().filter(function() {
            return ((this.nodeType == 3) && (this.data.trim().length > 0));
        }).wrap('<span>');
    });

    // define the behavior of the button when it's clicked
    $('.copybutton').click(function(e){
        e.preventDefault();
        var button = $(this);
        if (button.data('hidden') === 'false') {
            // hide the code output
            button.parent().find('.go, .gp, .gt').hide();
            button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden');
            button.css('text-decoration', 'line-through');
            button.attr('title', show_text);
            button.data('hidden', 'true');
        } else {
            // show the code output
            button.parent().find('.go, .gp, .gt').show();
            button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible');
            button.css('text-decoration', 'none');
            button.attr('title', hide_text);
            button.data('hidden', 'false');
        }
    });
});
imbalanced-learn-0.7.0/doc/_templates/000077500000000000000000000000001366766276300176245ustar00rootroot00000000000000imbalanced-learn-0.7.0/doc/_templates/breadcrumbs.html000066400000000000000000000001401366766276300227760ustar00rootroot00000000000000{%- extends "sphinx_rtd_theme/breadcrumbs.html" %} {% block breadcrumbs_aside %} {% endblock %}imbalanced-learn-0.7.0/doc/_templates/class.rst000066400000000000000000000004421366766276300214630ustar00rootroot00000000000000:mod:`{{module}}`.{{objname}} {{ underline }}============== .. currentmodule:: {{ module }} .. autoclass:: {{ objname }} {% block methods %} .. automethod:: __init__ {% endblock %} .. include:: {{module}}.{{objname}}.examples .. raw:: html
imbalanced-learn-0.7.0/doc/_templates/function.rst000066400000000000000000000003451366766276300222050ustar00rootroot00000000000000:mod:`{{module}}`.{{objname}} {{ underline }}==================== .. currentmodule:: {{ module }} .. autofunction:: {{ objname }} .. include:: {{module}}.{{objname}}.examples .. raw:: html
imbalanced-learn-0.7.0/doc/_templates/numpydoc_docstring.py000066400000000000000000000003261366766276300241110ustar00rootroot00000000000000{{index}} {{summary}} {{extended_summary}} {{parameters}} {{returns}} {{yields}} {{other_parameters}} {{attributes}} {{raises}} {{warns}} {{warnings}} {{see_also}} {{notes}} {{references}} {{examples}} {{methods}} imbalanced-learn-0.7.0/doc/about.rst000066400000000000000000000012131366766276300173300ustar00rootroot00000000000000About us ======== .. include:: ../AUTHORS.rst .. _citing-imbalanced-learn: Citing imbalanced-learn ----------------------- If you use imbalanced-learn in a scientific publication, we would appreciate citations to the following paper:: @article{JMLR:v18:16-365, author = {Guillaume Lema{{\^i}}tre and Fernando Nogueira and Christos K. Aridas}, title = {Imbalanced-learn: A Python Toolbox to Tackle the Curse of Imbalanced Datasets in Machine Learning}, journal = {Journal of Machine Learning Research}, year = {2017}, volume = {18}, number = {17}, pages = {1-5}, url = {http://jmlr.org/papers/v18/16-365.html} } imbalanced-learn-0.7.0/doc/api.rst000066400000000000000000000120371366766276300167750ustar00rootroot00000000000000###################### imbalanced-learn API ###################### This is the full API documentation of the `imbalanced-learn` toolbox. .. _under_sampling_ref: :mod:`imblearn.under_sampling`: Under-sampling methods ====================================================== .. automodule:: imblearn.under_sampling :no-members: :no-inherited-members: .. currentmodule:: imblearn Prototype generation -------------------- .. automodule:: imblearn.under_sampling._prototype_generation :no-members: :no-inherited-members: .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: class.rst under_sampling.ClusterCentroids Prototype selection ------------------- .. automodule:: imblearn.under_sampling._prototype_selection :no-members: :no-inherited-members: .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: class.rst under_sampling.CondensedNearestNeighbour under_sampling.EditedNearestNeighbours under_sampling.RepeatedEditedNearestNeighbours under_sampling.AllKNN under_sampling.InstanceHardnessThreshold under_sampling.NearMiss under_sampling.NeighbourhoodCleaningRule under_sampling.OneSidedSelection under_sampling.RandomUnderSampler under_sampling.TomekLinks .. _over_sampling_ref: :mod:`imblearn.over_sampling`: Over-sampling methods ==================================================== .. automodule:: imblearn.over_sampling :no-members: :no-inherited-members: .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: class.rst over_sampling.ADASYN over_sampling.BorderlineSMOTE over_sampling.KMeansSMOTE over_sampling.RandomOverSampler over_sampling.SMOTE over_sampling.SMOTENC over_sampling.SVMSMOTE .. _combine_ref: :mod:`imblearn.combine`: Combination of over- and under-sampling methods ======================================================================== .. automodule:: imblearn.combine :no-members: :no-inherited-members: .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: class.rst combine.SMOTEENN combine.SMOTETomek .. _ensemble_ref: :mod:`imblearn.ensemble`: Ensemble methods ========================================== .. automodule:: imblearn.ensemble :no-members: :no-inherited-members: .. currentmodule:: imblearn .. 
autosummary:: :toctree: generated/ :template: class.rst ensemble.BalancedBaggingClassifier ensemble.BalancedRandomForestClassifier ensemble.EasyEnsembleClassifier ensemble.RUSBoostClassifier .. _keras_ref: :mod:`imblearn.keras`: Batch generator for Keras ================================================ .. automodule:: imblearn.keras :no-members: :no-inherited-members: .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: class.rst keras.BalancedBatchGenerator .. autosummary:: :toctree: generated/ :template: function.rst keras.balanced_batch_generator .. _tensorflow_ref: :mod:`imblearn.tensorflow`: Batch generator for TensorFlow ========================================================== .. automodule:: imblearn.tensorflow :no-members: :no-inherited-members: .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: function.rst tensorflow.balanced_batch_generator .. _misc_ref: Miscellaneous ============= Imbalanced-learn provides some fast-prototyping tools. .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: class.rst FunctionSampler .. _pipeline_ref: :mod:`imblearn.pipeline`: Pipeline ================================== .. automodule:: imblearn.pipeline :no-members: :no-inherited-members: .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: class.rst pipeline.Pipeline .. autosummary:: :toctree: generated/ :template: function.rst pipeline.make_pipeline .. _metrics_ref: :mod:`imblearn.metrics`: Metrics ================================ .. automodule:: imblearn.metrics :no-members: :no-inherited-members: .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: function.rst metrics.classification_report_imbalanced metrics.sensitivity_specificity_support metrics.sensitivity_score metrics.specificity_score metrics.geometric_mean_score metrics.make_index_balanced_accuracy .. _datasets_ref: :mod:`imblearn.datasets`: Datasets ================================== .. automodule:: imblearn.datasets :no-members: :no-inherited-members: .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: function.rst datasets.make_imbalance datasets.fetch_datasets :mod:`imblearn.utils`: Utilities ================================ .. automodule:: imblearn.utils :no-members: :no-inherited-members: .. currentmodule:: imblearn ..
autosummary:: :toctree: generated/ :template: function.rst utils.estimator_checks.parametrize_with_checks utils.check_neighbors_object utils.check_sampling_strategy imbalanced-learn-0.7.0/doc/bibtex/000077500000000000000000000000001366766276300167445ustar00rootroot00000000000000imbalanced-learn-0.7.0/doc/bibtex/refs.bib000066400000000000000000000135251366766276300203670ustar00rootroot00000000000000@article{batista2004study, title={A study of the behavior of several methods for balancing machine learning training data}, author={Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina}, journal={ACM SIGKDD explorations newsletter}, volume={6}, number={1}, pages={20--29}, year={2004}, publisher={ACM} } @inproceedings{batista2003balancing, title={Balancing Training Data for Automated Annotation of Keywords: a Case Study.}, author={Batista, Gustavo EAPA and Bazzan, Ana LC and Monard, Maria Carolina}, booktitle={WOB}, pages={10--18}, year={2003} } @article{chen2004using, title={Using random forest to learn imbalanced data}, author={Chen, Chao and Liaw, Andy and Breiman, Leo and others}, journal={University of California, Berkeley}, volume={110}, number={1-12}, pages={24}, year={2004} } @article{liu2008exploratory, title={Exploratory undersampling for class-imbalance learning}, author={Liu, Xu-Ying and Wu, Jianxin and Zhou, Zhi-Hua}, journal={IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics)}, volume={39}, number={2}, pages={539--550}, year={2008}, publisher={IEEE} } @article{seiffert2009rusboost, title={RUSBoost: A hybrid approach to alleviating class imbalance}, author={Seiffert, Chris and Khoshgoftaar, Taghi M and Van Hulse, Jason and Napolitano, Amri}, journal={IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans}, volume={40}, number={1}, pages={185--197}, year={2009}, publisher={IEEE} } @inproceedings{kubat1997addressing, title={Addressing the curse of imbalanced training sets: one-sided selection}, author={Kubat, Miroslav and Matwin, Stan and others}, booktitle={Icml}, volume={97}, pages={179--186}, year={1997}, organization={Nashville, USA} } @article{barandela2003strategies, title={Strategies for learning in class imbalance problems}, author={Barandela, Ricardo and S{\'a}nchez, Jos{\'e} Salvador and Garca, V and Rangel, Edgar}, journal={Pattern Recognition}, volume={36}, number={3}, pages={849--851}, year={2003}, publisher={Elsevier Science Publishing Company, Inc.} } @article{garcia2012effectiveness, title={On the effectiveness of preprocessing methods when dealing with different levels of class imbalance}, author={Garc{\'\i}a, Vicente and S{\'a}nchez, Jos{\'e} Salvador and Mollineda, Ram{\'o}n Alberto}, journal={Knowledge-Based Systems}, volume={25}, number={1}, pages={13--21}, year={2012}, publisher={Elsevier} } @inproceedings{he2008adasyn, title={ADASYN: Adaptive synthetic sampling approach for imbalanced learning}, author={He, Haibo and Bai, Yang and Garcia, Edwardo A and Li, Shutao}, booktitle={2008 IEEE International Joint Conference on Neural Networks (IEEE World Congress on Computational Intelligence)}, pages={1322--1328}, year={2008}, organization={IEEE} } @article{chawla2002smote, title={SMOTE: synthetic minority over-sampling technique}, author={Chawla, Nitesh V and Bowyer, Kevin W and Hall, Lawrence O and Kegelmeyer, W Philip}, journal={Journal of artificial intelligence research}, volume={16}, pages={321--357}, year={2002} } @inproceedings{han2005borderline, title={Borderline-SMOTE: a new over-sampling method in 
imbalanced data sets learning}, author={Han, Hui and Wang, Wen-Yuan and Mao, Bing-Huan}, booktitle={International conference on intelligent computing}, pages={878--887}, year={2005}, organization={Springer} } @inproceedings{nguyen2009borderline, title={Borderline over-sampling for imbalanced data classification}, author={Nguyen, Hien M and Cooper, Eric W and Kamei, Katsuari}, booktitle={Proceedings: Fifth International Workshop on Computational Intelligence \& Applications}, volume={2009}, number={1}, pages={24--29}, year={2009}, organization={IEEE SMC Hiroshima Chapter} } @article{last2017oversampling, title={Oversampling for Imbalanced Learning Based on K-Means and SMOTE}, author={Last, Felix and Douzas, Georgios and Bacao, Fernando}, journal={arXiv preprint arXiv:1711.00837}, year={2017} } @inproceedings{mani2003knn, title={kNN approach to unbalanced data distributions: a case study involving information extraction}, author={Mani, Inderjeet and Zhang, I}, booktitle={Proceedings of workshop on learning from imbalanced datasets}, volume={126}, year={2003} } @article{tomek1976two, title={Two modifications of CNN}, author={Tomek, Ivan}, journal={IEEE Trans. Systems, Man and Cybernetics}, volume={6}, pages={769--772}, year={1976} } @article{wilson1972asymptotic, title={Asymptotic properties of nearest neighbor rules using edited data}, author={Wilson, Dennis L}, journal={IEEE Transactions on Systems, Man, and Cybernetics}, number={3}, pages={408--421}, year={1972}, publisher={IEEE} } @article{tomek1976experiment, title={An experiment with the edited nearest-neighbor rule}, author={Tomek, Ivan}, journal={IEEE Transactions on systems, Man, and Cybernetics}, volume={6}, number={6}, pages={448--452}, year={1976} } @article{hart1968condensed, title={The condensed nearest neighbor rule (Corresp.)}, author={Hart, Peter}, journal={IEEE transactions on information theory}, volume={14}, number={3}, pages={515--516}, year={1968}, publisher={Citeseer} } @inproceedings{laurikkala2001improving, title={Improving identification of difficult small classes by balancing class distribution}, author={Laurikkala, Jorma}, booktitle={Conference on Artificial Intelligence in Medicine in Europe}, pages={63--66}, year={2001}, organization={Springer} } @article{smith2014instance, title={An instance level analysis of data complexity}, author={Smith, Michael R and Martinez, Tony and Giraud-Carrier, Christophe}, journal={Machine learning}, volume={95}, number={2}, pages={225--256}, year={2014}, publisher={Springer} } imbalanced-learn-0.7.0/doc/combine.rst000066400000000000000000000044411366766276300176400ustar00rootroot00000000000000.. _combine: ======================================= Combination of over- and under-sampling ======================================= .. currentmodule:: imblearn.over_sampling We previously presented :class:`SMOTE` and showed that this method can generate noisy samples by interpolating new points between marginal outliers and inliers. This issue can be solved by cleaning the space resulting from over-sampling. .. currentmodule:: imblearn.combine In this regard, Tomek's link and edited nearest-neighbours are the two cleaning methods that have been added to the pipeline after applying SMOTE over-sampling to obtain a cleaner space. The two ready-to-use classes imbalanced-learn implements for combining over- and under-sampling methods are: (i) :class:`SMOTETomek` :cite:`batista2004study` and (ii) :class:`SMOTEENN` :cite:`batista2003balancing`.
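Under the hood, each combined sampler simply chains the SMOTE over-sampler with the
corresponding cleaning method; the inner steps can also be configured explicitly
through the ``smote`` and ``enn`` (or ``tomek``) parameters. A minimal sketch of
configuring the inner steps (the explicit spelling below may differ slightly from
the parameter defaults)::

    >>> from imblearn.combine import SMOTEENN
    >>> from imblearn.over_sampling import SMOTE
    >>> from imblearn.under_sampling import EditedNearestNeighbours
    >>> # configure the over-sampling and cleaning steps explicitly
    >>> smote_enn = SMOTEENN(smote=SMOTE(random_state=0),
    ...                      enn=EditedNearestNeighbours())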
Those two classes can be used like any other sampler with parameters identical to those of the samplers they combine:: >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, ... n_redundant=0, n_repeated=0, n_classes=3, ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], ... class_sep=0.8, random_state=0) >>> print(sorted(Counter(y).items())) [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.combine import SMOTEENN >>> smote_enn = SMOTEENN(random_state=0) >>> X_resampled, y_resampled = smote_enn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4060), (1, 4381), (2, 3502)] >>> from imblearn.combine import SMOTETomek >>> smote_tomek = SMOTETomek(random_state=0) >>> X_resampled, y_resampled = smote_tomek.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4499), (1, 4566), (2, 4413)] We can also see in the example below that :class:`SMOTEENN` tends to clean more noisy samples than :class:`SMOTETomek`. .. image:: ./auto_examples/combine/images/sphx_glr_plot_comparison_combine_001.png :target: ./auto_examples/combine/plot_comparison_combine.html :scale: 60 :align: center .. topic:: Examples * :ref:`sphx_glr_auto_examples_combine_plot_comparison_combine.py` imbalanced-learn-0.7.0/doc/conf.py000066400000000000000000000264451366766276300170010ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # imbalanced-learn documentation build configuration file, created by # sphinx-quickstart on Mon Jan 18 14:44:12 2016. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import os import sys import sphinx_rtd_theme # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.abspath('sphinxext')) from github_link import make_linkcode_resolve import sphinx_gallery # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.linkcode', 'sphinxcontrib.bibtex', 'numpydoc', 'sphinx_issues', 'sphinx_gallery.gen_gallery', ] # this is needed for some reason... # see https://github.com/numpy/numpydoc/issues/69 numpydoc_show_class_members = False extensions.append('sphinx.ext.imgmath') imgmath_image_format = 'svg' autodoc_default_flags = ['members', 'inherited-members'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # generate autosummary even if no references autosummary_generate = True # The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. # source_encoding = 'utf-8-sig' # Generate the plot for the gallery plot_gallery = True # The master toctree document. master_doc = 'index' # General information about the project. project = 'imbalanced-learn' copyright = '2016 - 2017, G.
Lemaitre, F. Nogueira, D. Oliveira, C. Aridas' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. from imblearn import __version__ version = __version__ # The full version, including alpha/beta/rc tags. release = __version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: # today = '' # Else, today_fmt is used as the format for a strftime call. # today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = ['_build', '_templates'] # The reST default role (used for this markup: `text`) to use for all # documents. default_role = 'literal' # If true, '()' will be appended to :func: etc. cross-reference text. add_function_parentheses = False # If true, the current module name will be prepended to all description # unit titles (such as .. function::). # add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. # show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # Custom style html_style = 'css/imbalanced-learn.css' # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. # keep_warnings = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". # html_title = None # A shorter title for the navigation bar. Default is the same as html_title. # html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. # html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. # html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. # html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. # html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. 
# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. # html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. # html_additional_pages = {} # If false, no module index is generated. # html_domain_indices = True # If false, no index is generated. # html_use_index = True # If true, the index is split into individual pages for each letter. # html_split_index = False # If true, links to the reST sources are added to the pages. # html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. # html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. # html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. # html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). # html_file_suffix = None # Output file base name for HTML help builder. htmlhelp_basename = 'imbalanced-learndoc' # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # 'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ ('index', 'imbalanced-learn.tex', 'imbalanced-learn Documentation', 'G. Lemaitre, F. Nogueira, D. Oliveira, C. Aridas', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. # latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. # latex_use_parts = False # If true, show page references after internal links. # latex_show_pagerefs = False # If true, show URL addresses after external links. # latex_show_urls = False # Documents to append as an appendix to all manuals. # latex_appendices = [] # intersphinx configuration intersphinx_mapping = { 'python': ('https://docs.python.org/{.major}'.format( sys.version_info), None), 'numpy': ('https://docs.scipy.org/doc/numpy/', None), 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), 'matplotlib': ('https://matplotlib.org/', None), 'sklearn': ('http://scikit-learn.org/stable', None) } # sphinx-gallery configuration sphinx_gallery_conf = { 'doc_module': 'imblearn', 'backreferences_dir': os.path.join('generated'), 'show_memory': True, 'reference_url': { 'imblearn': None} } # -- Options for manual page output --------------------------------------- # If false, no module index is generated. # latex_domain_indices = True # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [('index', 'imbalanced-learn', 'imbalanced-learn Documentation', ['G. Lemaitre, F. Nogueira, D. Oliveira, C. Aridas'], 1)] # If true, show URL addresses after external links. # man_show_urls = False # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. 
List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ('index', 'imbalanced-learn', 'imbalanced-learn Documentation', 'G. Lemaitre, F. Nogueira, D. Oliveira, C. Aridas', 'imbalanced-learn', 'Toolbox for imbalanced dataset in machine learning.', 'Miscellaneous'), ] # def generate_example_rst(app, what, name, obj, options, lines): # # generate empty examples files, so that we don't get # # inclusion errors if there are no examples for a class / module # examples_path = os.path.join(app.srcdir, "generated", # "%s.examples" % name) # if not os.path.exists(examples_path): # # touch file # open(examples_path, 'w').close() # Config for sphinx_issues issues_uri = 'https://github.com/scikit-learn-contrib/imbalanced-learn/issues/{issue}' issues_github_path = 'scikit-learn-contrib/imbalanced-learn' issues_user_uri = 'https://github.com/{user}' # Temporary work-around for spacing problem between parameter and parameter # type in the doc, see https://github.com/numpy/numpydoc/issues/215. The bug # has been fixed in sphinx (https://github.com/sphinx-doc/sphinx/pull/5976) but # through a change in sphinx basic.css except rtd_theme does not use basic.css. # In an ideal world, this would get fixed in this PR: # https://github.com/readthedocs/sphinx_rtd_theme/pull/747/files def setup(app): app.add_javascript('js/copybutton.js') app.add_stylesheet("basic.css") # app.connect('autodoc-process-docstring', generate_example_rst) # Documents to append as an appendix to all manuals. # texinfo_appendices = [] # If false, no module index is generated. # texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. # texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. # texinfo_no_detailmenu = False # The following is used by sphinx.ext.linkcode to provide links to github linkcode_resolve = make_linkcode_resolve('imblearn', 'https://github.com/scikit-learn-contrib/' 'imbalanced-learn/blob/{revision}/' '{package}/{path}#L{lineno}') imbalanced-learn-0.7.0/doc/datasets/000077500000000000000000000000001366766276300172775ustar00rootroot00000000000000imbalanced-learn-0.7.0/doc/datasets/index.rst000066400000000000000000000175741366766276300211540ustar00rootroot00000000000000.. _datasets: ========================= Dataset loading utilities ========================= .. currentmodule:: imblearn.datasets The ``imblearn.datasets`` package complements the ``sklearn.datasets`` package. The package provides both: (i) a set of imbalanced datasets to perform systematic benchmarks and (ii) a utility to create an imbalanced dataset from an original balanced dataset. .. _zenodo: Imbalanced datasets for benchmark ================================= :func:`fetch_datasets` allows fetching 27 datasets, which are imbalanced and binarized.
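The datasets are downloaded on first use and then cached locally (in scikit-learn's
data home by default). A subset of the benchmark can be selected through the
``filter_data`` parameter; a minimal sketch, assuming this parameter accepts a
tuple of dataset names and that the download is possible::

    >>> from imblearn.datasets import fetch_datasets
    >>> # fetch only two of the benchmark datasets
    >>> datasets = fetch_datasets(filter_data=('ecoli', 'abalone'))
    >>> sorted(datasets.keys())
    ['abalone', 'ecoli']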
The following data sets are available: +--+--------------+-------------------------------+-------+---------+-----+ |ID|Name | Repository & Target | Ratio | #S | #F | +==+==============+===============================+=======+=========+=====+ |1 |ecoli | UCI, target: imU | 8.6:1 | 336 | 7 | +--+--------------+-------------------------------+-------+---------+-----+ |2 |optical_digits| UCI, target: 8 | 9.1:1 | 5,620 | 64 | +--+--------------+-------------------------------+-------+---------+-----+ |3 |satimage | UCI, target: 4 | 9.3:1 | 6,435 | 36 | +--+--------------+-------------------------------+-------+---------+-----+ |4 |pen_digits | UCI, target: 5 | 9.4:1 | 10,992 | 16 | +--+--------------+-------------------------------+-------+---------+-----+ |5 |abalone | UCI, target: 7 | 9.7:1 | 4,177 | 10 | +--+--------------+-------------------------------+-------+---------+-----+ |6 |sick_euthyroid| UCI, target: sick euthyroid | 9.8:1 | 3,163 | 42 | +--+--------------+-------------------------------+-------+---------+-----+ |7 |spectrometer | UCI, target: >=44 | 11:1 | 531 | 93 | +--+--------------+-------------------------------+-------+---------+-----+ |8 |car_eval_34 | UCI, target: good, v good | 12:1 | 1,728 | 21 | +--+--------------+-------------------------------+-------+---------+-----+ |9 |isolet | UCI, target: A, B | 12:1 | 7,797 | 617 | +--+--------------+-------------------------------+-------+---------+-----+ |10|us_crime | UCI, target: >0.65 | 12:1 | 1,994 | 100 | +--+--------------+-------------------------------+-------+---------+-----+ |11|yeast_ml8 | LIBSVM, target: 8 | 13:1 | 2,417 | 103 | +--+--------------+-------------------------------+-------+---------+-----+ |12|scene | LIBSVM, target: >one label | 13:1 | 2,407 | 294 | +--+--------------+-------------------------------+-------+---------+-----+ |13|libras_move | UCI, target: 1 | 14:1 | 360 | 90 | +--+--------------+-------------------------------+-------+---------+-----+ |14|thyroid_sick | UCI, target: sick | 15:1 | 3,772 | 52 | +--+--------------+-------------------------------+-------+---------+-----+ |15|coil_2000 | KDD, CoIL, target: minority | 16:1 | 9,822 | 85 | +--+--------------+-------------------------------+-------+---------+-----+ |16|arrhythmia | UCI, target: 06 | 17:1 | 452 | 278 | +--+--------------+-------------------------------+-------+---------+-----+ |17|solar_flare_m0| UCI, target: M->0 | 19:1 | 1,389 | 32 | +--+--------------+-------------------------------+-------+---------+-----+ |18|oil | UCI, target: minority | 22:1 | 937 | 49 | +--+--------------+-------------------------------+-------+---------+-----+ |19|car_eval_4 | UCI, target: vgood | 26:1 | 1,728 | 21 | +--+--------------+-------------------------------+-------+---------+-----+ |20|wine_quality | UCI, wine, target: <=4 | 26:1 | 4,898 | 11 | +--+--------------+-------------------------------+-------+---------+-----+ |21|letter_img | UCI, target: Z | 26:1 | 20,000 | 16 | +--+--------------+-------------------------------+-------+---------+-----+ |22|yeast_me2 | UCI, target: ME2 | 28:1 | 1,484 | 8 | +--+--------------+-------------------------------+-------+---------+-----+ |23|webpage | LIBSVM, w7a, target: minority | 33:1 | 34,780 | 300 | +--+--------------+-------------------------------+-------+---------+-----+ |24|ozone_level | UCI, ozone, data | 34:1 | 2,536 | 72 | +--+--------------+-------------------------------+-------+---------+-----+ |25|mammography | UCI, target: minority | 42:1 | 11,183 | 6 | 
+--+--------------+-------------------------------+-------+---------+-----+
|26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
+--+--------------+-------------------------------+-------+---------+-----+
|27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
+--+--------------+-------------------------------+-------+---------+-----+

A specific data set can be selected as::

    >>> from collections import Counter
    >>> from imblearn.datasets import fetch_datasets
    >>> ecoli = fetch_datasets()['ecoli']
    >>> ecoli.data.shape
    (336, 7)
    >>> print(sorted(Counter(ecoli.target).items()))
    [(-1, 301), (1, 35)]

.. _make_imbalanced: Imbalanced generator ==================== :func:`make_imbalance` turns an original dataset into an imbalanced dataset. This behaviour is driven by the parameter ``sampling_strategy``, which behaves similarly to the other resampling algorithms. ``sampling_strategy`` can be given as a dictionary where the key corresponds to the class and the value is the number of samples in the class::

    >>> from sklearn.datasets import load_iris
    >>> from imblearn.datasets import make_imbalance
    >>> iris = load_iris()
    >>> sampling_strategy = {0: 20, 1: 30, 2: 40}
    >>> X_imb, y_imb = make_imbalance(iris.data, iris.target,
    ...                               sampling_strategy=sampling_strategy)
    >>> sorted(Counter(y_imb).items())
    [(0, 20), (1, 30), (2, 40)]

Note that all samples of a class are passed through if the class is not mentioned in the dictionary::

    >>> sampling_strategy = {0: 10}
    >>> X_imb, y_imb = make_imbalance(iris.data, iris.target,
    ...                               sampling_strategy=sampling_strategy)
    >>> sorted(Counter(y_imb).items())
    [(0, 10), (1, 50), (2, 50)]

Instead of a dictionary, a function can be defined and passed directly to ``sampling_strategy``::

    >>> def ratio_multiplier(y):
    ...     multiplier = {0: 0.5, 1: 0.7, 2: 0.95}
    ...     target_stats = Counter(y)
    ...     for key, value in target_stats.items():
    ...         target_stats[key] = int(value * multiplier[key])
    ...     return target_stats
    >>> X_imb, y_imb = make_imbalance(iris.data, iris.target,
    ...                               sampling_strategy=ratio_multiplier)
    >>> sorted(Counter(y_imb).items())
    [(0, 25), (1, 35), (2, 47)]

It also works with a pandas dataframe::

    >>> from sklearn.datasets import fetch_openml
    >>> df, y = fetch_openml(
    ...     'iris', version=1, return_X_y=True, as_frame=True)
    >>> df_resampled, y_resampled = make_imbalance(
    ...     df, y, sampling_strategy={'Iris-setosa': 10, 'Iris-versicolor': 20},
    ...     random_state=42)
    >>> df_resampled.head()
        sepallength sepalwidth petallength petalwidth
    13 4.3 3.0 1.1 0.1
    39 5.1 3.4 1.5 0.2
    30 4.8 3.1 1.6 0.2
    45 4.8 3.0 1.4 0.3
    17 5.1 3.5 1.4 0.3
    >>> Counter(y_resampled)
    Counter({'Iris-virginica': 50, 'Iris-versicolor': 20, 'Iris-setosa': 10})

See :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py` and :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`. imbalanced-learn-0.7.0/doc/developers_utils.rst000066400000000000000000000130261366766276300216130ustar00rootroot00000000000000.. _developers-utils: ======================== Utilities for Developers ======================== Imbalanced-learn contains a number of utilities to help with development. These are located in :mod:`imblearn.utils`, and include tools in a number of categories. All the following functions and classes are in the module :mod:`imblearn.utils`. .. warning :: These utilities are meant to be used internally within the imbalanced-learn package. They are not guaranteed to be stable between versions of imbalanced-learn. Backports, in particular, will be removed as the imbalanced-learn dependencies evolve.
Validation Tools ================ .. currentmodule:: imblearn.utils These are tools used to check and validate input. When you write a function which accepts arrays, matrices, or sparse matrices as arguments, the following should be used when applicable. - :func:`check_neighbors_object`: Check that the object is consistent with a nearest-neighbours estimator. - :func:`check_target_type`: Check that the target type conforms to the current samplers. - :func:`check_sampling_strategy`: Check that the sampling target is consistent with the type and return a dictionary containing each targeted class with its corresponding number of samples. Deprecation =========== .. currentmodule:: imblearn.utils.deprecation .. warning :: Apart from :func:`deprecate_parameter` the rest of this section is taken from scikit-learn. Please refer to their original documentation. If any publicly accessible method, function, attribute or parameter is renamed, we still support the old one for two releases and issue a deprecation warning when it is called/passed/accessed. E.g., if the function ``zero_one`` is renamed to ``zero_one_loss``, we add the decorator ``deprecated`` (from ``sklearn.utils``) to ``zero_one`` and call ``zero_one_loss`` from that function::

    from ..utils import deprecated

    def zero_one_loss(y_true, y_pred, normalize=True):
        # actual implementation
        pass

    @deprecated("Function 'zero_one' was renamed to 'zero_one_loss' "
                "in version 0.13 and will be removed in release 0.15. "
                "Default behavior is changed from 'normalize=False' to "
                "'normalize=True'")
    def zero_one(y_true, y_pred, normalize=False):
        return zero_one_loss(y_true, y_pred, normalize)

If an attribute is to be deprecated, use the decorator ``deprecated`` on a property. E.g., renaming an attribute ``labels_`` to ``classes_`` can be done as::

    @property
    @deprecated("Attribute labels_ was deprecated in version 0.13 and "
                "will be removed in 0.15. Use 'classes_' instead")
    def labels_(self):
        return self.classes_

If a parameter has to be deprecated, use ``DeprecationWarning`` appropriately. In the following example, k is deprecated and renamed to n_clusters::

    import warnings

    def example_function(n_clusters=8, k=None):
        if k is not None:
            warnings.warn("'k' was renamed to n_clusters in version 0.13 and "
                          "will be removed in 0.15.", DeprecationWarning)
            n_clusters = k

As in these examples, the warning message should always give both the version in which the deprecation happened and the version in which the old behavior will be removed. If the deprecation happened in version 0.x-dev, the message should say deprecation occurred in version 0.x and the removal will be in 0.(x+2). For example, if the deprecation happened in version 0.18-dev, the message should say it happened in version 0.18 and the old behavior will be removed in version 0.20. In addition, a deprecation note should be added in the docstring, recalling the same information as the deprecation warning as explained above. Use the ``.. deprecated::`` directive::

    .. deprecated:: 0.13
       ``k`` was renamed to ``n_clusters`` in version 0.13 and will be removed
       in 0.15.

On top of the functionality provided by scikit-learn, imbalanced-learn provides :func:`deprecate_parameter`, which is used to deprecate a sampler's parameter (attribute) in favour of another one. Testing utilities ================= Currently, imbalanced-learn provides a warning-management utility, :func:`imblearn.utils.testing.warns`. This feature is going to be merged into pytest and will be removed once a pytest release includes it.
This function may be used as a context manager::

    >>> import warnings
    >>> from imblearn.utils.testing import warns
    >>> with warns(RuntimeWarning):
    ...     warnings.warn("my runtime warning", RuntimeWarning)

    >>> with warns(RuntimeWarning):
    ...     pass
    Traceback (most recent call last):
      ...
    Failed: DID NOT WARN. No warnings of type ...RuntimeWarning... was emitted...

    >>> with warns(RuntimeWarning):
    ...     warnings.warn(UserWarning)
    Traceback (most recent call last):
      ...
    Failed: DID NOT WARN. No warnings of type ...RuntimeWarning... was emitted...

In the context manager form you may use the keyword argument ``match`` to assert that the exception matches a text or regex::

    >>> import warnings
    >>> from imblearn.utils.testing import warns
    >>> with warns(UserWarning, match='must be 0 or None'):
    ...     warnings.warn("value must be 0 or None", UserWarning)

    >>> with warns(UserWarning, match=r'must be \d+$'):
    ...     warnings.warn("value must be 42", UserWarning)

    >>> with warns(UserWarning, match=r'must be \d+$'):
    ...     warnings.warn("this is not here", UserWarning)
    Traceback (most recent call last):
      ...
    AssertionError: 'must be \d+$' pattern not found in ['this is not here']

imbalanced-learn-0.7.0/doc/ensemble.rst000066400000000000000000000112501366766276300200120ustar00rootroot00000000000000.. _ensemble: ==================== Ensemble of samplers ==================== .. currentmodule:: imblearn.ensemble .. _ensemble_meta_estimators: Classifier including inner balancing samplers ============================================= .. _bagging: Bagging classifier ------------------ In ensemble classifiers, bagging methods build several estimators on different randomly selected subsets of data. In scikit-learn, this classifier is named ``BaggingClassifier``. However, this classifier does not allow balancing each subset of data. Therefore, when trained on an imbalanced data set, this classifier will favor the majority classes::

    >>> from sklearn.datasets import make_classification
    >>> X, y = make_classification(n_samples=10000, n_features=2, n_informative=2,
    ...                            n_redundant=0, n_repeated=0, n_classes=3,
    ...                            n_clusters_per_class=1,
    ...                            weights=[0.01, 0.05, 0.94], class_sep=0.8,
    ...                            random_state=0)
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.metrics import balanced_accuracy_score
    >>> from sklearn.ensemble import BaggingClassifier
    >>> from sklearn.tree import DecisionTreeClassifier
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    >>> bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
    ...                        random_state=0)
    >>> bc.fit(X_train, y_train) #doctest: +ELLIPSIS
    BaggingClassifier(...)
    >>> y_pred = bc.predict(X_test)
    >>> balanced_accuracy_score(y_test, y_pred)  # doctest: +ELLIPSIS
    0.77...

:class:`BalancedBaggingClassifier` allows resampling each subset of data before training each estimator of the ensemble. In short, it combines the output of an :class:`EasyEnsemble` sampler with an ensemble of classifiers (i.e. ``BaggingClassifier``). Therefore, :class:`BalancedBaggingClassifier` takes the same parameters as the scikit-learn ``BaggingClassifier``. In addition, there are two extra parameters, ``sampling_strategy`` and ``replacement``, to control the behaviour of the random under-sampler::

    >>> from imblearn.ensemble import BalancedBaggingClassifier
    >>> bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
    ...                                 sampling_strategy='auto',
    ...                                 replacement=False,
    ...
random_state=0)
    >>> bbc.fit(X_train, y_train) # doctest: +ELLIPSIS
    BalancedBaggingClassifier(...)
    >>> y_pred = bbc.predict(X_test)
    >>> balanced_accuracy_score(y_test, y_pred)  # doctest: +ELLIPSIS
    0.8...

.. _forest: Forest of randomized trees -------------------------- :class:`BalancedRandomForestClassifier` is another ensemble method in which each tree of the forest will be provided a balanced bootstrap sample :cite:`chen2004using`. This class provides all the functionality of :class:`sklearn.ensemble.RandomForestClassifier`, notably the `feature_importances_` attribute::

    >>> from imblearn.ensemble import BalancedRandomForestClassifier
    >>> brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    >>> brf.fit(X_train, y_train) # doctest: +ELLIPSIS
    BalancedRandomForestClassifier(...)
    >>> y_pred = brf.predict(X_test)
    >>> balanced_accuracy_score(y_test, y_pred)  # doctest: +ELLIPSIS
    0.8...

.. _boosting: Boosting -------- Several methods taking advantage of boosting have been designed. :class:`RUSBoostClassifier` randomly under-samples the dataset before performing each boosting iteration :cite:`seiffert2009rusboost`::

    >>> from imblearn.ensemble import RUSBoostClassifier
    >>> rusboost = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R',
    ...                               random_state=0)
    >>> rusboost.fit(X_train, y_train)  # doctest: +ELLIPSIS
    RUSBoostClassifier(...)
    >>> y_pred = rusboost.predict(X_test)
    >>> balanced_accuracy_score(y_test, y_pred)  # doctest: +ELLIPSIS
    0...

A specific method which uses ``AdaBoost`` as learners in the bagging classifier is called EasyEnsemble. The :class:`EasyEnsembleClassifier` allows bagging AdaBoost learners which are trained on balanced bootstrap samples :cite:`liu2008exploratory`. Similarly to the :class:`BalancedBaggingClassifier` API, one can construct the ensemble as::

    >>> from imblearn.ensemble import EasyEnsembleClassifier
    >>> eec = EasyEnsembleClassifier(random_state=0)
    >>> eec.fit(X_train, y_train) # doctest: +ELLIPSIS
    EasyEnsembleClassifier(...)
    >>> y_pred = eec.predict(X_test)
    >>> balanced_accuracy_score(y_test, y_pred)  # doctest: +ELLIPSIS
    0.6...

.. topic:: Examples * :ref:`sphx_glr_auto_examples_ensemble_plot_comparison_ensemble_classifier.py` imbalanced-learn-0.7.0/doc/index.rst000066400000000000000000000034571366766276300173340ustar00rootroot00000000000000.. project-template documentation master file, created by sphinx-quickstart on Mon Jan 18 14:44:12 2016. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. ########################################## Welcome to imbalanced-learn documentation! ########################################## .. toctree:: :maxdepth: 2 :hidden: :caption: Getting Started install .. toctree:: :maxdepth: 2 :hidden: :caption: Documentation user_guide api .. toctree:: :maxdepth: 2 :hidden: :caption: Tutorial - Examples auto_examples/index .. toctree:: :maxdepth: 1 :hidden: :caption: Additional Information whats_new about `Getting started `_ --------------------------------- Information to install, test, and contribute to the package. `User Guide `_ ------------------------------- The main documentation. This contains an in-depth description of all algorithms and how to apply them. `API Documentation `_ ------------------------------- The exact API of all functions and classes, as given in the docstrings. The API documents expected types and allowed features for all functions, and all parameters available for the algorithms.
`Examples `_ -------------------------------------- A set of examples illustrating the use of the different algorithms. It complements the `User Guide `_. `What's new `_ ------------------------------ Log of the imbalanced-learn history. `About imbalanced-learn `_ -------------------------------------- A short history of imbalanced-learn. See the `README `_ for more information. imbalanced-learn-0.7.0/doc/install.rst000066400000000000000000000026371366766276300176720ustar00rootroot00000000000000######################## Install and contribution ######################## Prerequisites ============= The imbalanced-learn package requires the following dependencies: * python (>=3.6) * numpy (>=1.13.3) * scipy (>=0.19.1) * scikit-learn (>=0.23) * keras 2 (optional) * tensorflow (optional) Install ======= imbalanced-learn is currently available on PyPI and you can install it via `pip`::

    pip install -U imbalanced-learn

The package is also released on the Anaconda Cloud platform::

    conda install -c conda-forge imbalanced-learn

If you prefer, you can clone it and run the setup.py file. Use the following commands to get a copy from Github and install all dependencies::

    git clone https://github.com/scikit-learn-contrib/imbalanced-learn.git
    cd imbalanced-learn
    pip install .

Or install using pip and GitHub::

    pip install -U git+https://github.com/scikit-learn-contrib/imbalanced-learn.git

Test and coverage ================= To test the code before installing::

    $ make test

To check the coverage of your version::

    $ make coverage

You can also use `pytest`::

    $ pytest imblearn -v

Contribute ========== You can contribute to this code through Pull Request on GitHub_. Please make sure that your code comes with unit tests to ensure full coverage and continuous integration. .. _GitHub: https://github.com/scikit-learn-contrib/imbalanced-learn/pulls imbalanced-learn-0.7.0/doc/introduction.rst000066400000000000000000000041761366766276300207450ustar00rootroot00000000000000.. _introduction: ============ Introduction ============ .. _api_imblearn: API's of imbalanced-learn samplers ---------------------------------- The available samplers follow the scikit-learn API, using a base estimator and adding a sampling functionality through the ``fit_resample`` method: :Estimator: The base object, implements a ``fit`` method to learn from data, either::

    estimator = obj.fit(data, targets)

:Resampler: To resample a data set, each sampler implements::

    data_resampled, targets_resampled = obj.fit_resample(data, targets)

Imbalanced-learn samplers accept the same inputs as scikit-learn estimators: * ``data``: array-like (2-D list, pandas.Dataframe, numpy.array) or sparse matrices; * ``targets``: array-like (1-D list, pandas.Series, numpy.array). The output will be of the following type: * ``data_resampled``: array-like (2-D list, pandas.Dataframe, numpy.array) or sparse matrices; * ``targets_resampled``: 1-D numpy.array or pd.Series. .. topic:: Sparse input For sparse input the data is **converted to the Compressed Sparse Rows representation** (see ``scipy.sparse.csr_matrix``) before being fed to the sampler. To avoid unnecessary memory copies, it is recommended to choose the CSR representation upstream. .. _problem_statement: Problem statement regarding imbalanced data sets ------------------------------------------------ The learning phase and the subsequent prediction of machine learning algorithms can be affected by the problem of imbalanced data sets.
.. _problem_statement: Problem statement regarding imbalanced data sets ------------------------------------------------ The learning phase and the subsequent prediction of machine learning algorithms can be affected by the problem of imbalanced data sets. The balancing issue corresponds to a difference in the number of samples across the different classes. We illustrate the effect of training a linear SVM classifier with different levels of class balancing. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_001.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :scale: 60 :align: center As expected, the decision function of the linear SVM is highly impacted. With a greater imbalance ratio, the decision function favors the class with the larger number of samples, usually referred to as the majority class. imbalanced-learn-0.7.0/doc/make.bat000066400000000000000000000151011366766276300170720ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=_build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . set I18NSPHINXOPTS=%SPHINXOPTS% . if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^<target^>` where ^<target^> is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. xml to make Docutils-native XML files echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) if "%1" == "html" ( %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo.
echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\imbalanced-learn.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\imbalanced-learn.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf cd %BUILDDIR%/.. echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja cd %BUILDDIR%/.. echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 echo. echo.Build finished. The XML files are in %BUILDDIR%/xml. goto end ) if "%1" == "pseudoxml" ( %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml if errorlevel 1 exit /b 1 echo. echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. goto end ) :end imbalanced-learn-0.7.0/doc/metrics.rst000066400000000000000000000034011366766276300176650ustar00rootroot00000000000000.. 
_metrics: ======= Metrics ======= .. currentmodule:: imblearn.metrics Currently, scikit-learn only offers the ``sklearn.metrics.balanced_accuracy_score`` (in 0.20) as a metric to deal with imbalanced datasets. The module :mod:`imblearn.metrics` offers a couple of other metrics which are used in the literature to evaluate the quality of classifiers. .. _sensitivity_specificity: Sensitivity and specificity metrics ----------------------------------- Sensitivity and specificity are metrics which are well known in medical imaging. Sensitivity (also called true positive rate or recall) is the proportion of the positive samples which are well classified while specificity (also called true negative rate) is the proportion of the negative samples which are well classified. Therefore, depending on the field of application, either the sensitivity/specificity or the precision/recall pair of metrics are used. Currently, only the `precision and recall metrics <https://scikit-learn.org/stable/modules/model_evaluation.html#precision-recall-f-measure-metrics>`_ are implemented in scikit-learn. :func:`sensitivity_specificity_support`, :func:`sensitivity_score`, and :func:`specificity_score` add the possibility to use those metrics. .. _imbalanced_metrics: Additional metrics specific to imbalanced datasets -------------------------------------------------- The :func:`geometric_mean_score` :cite:`barandela2003strategies,kubat1997addressing` is the root of the product of the class-wise sensitivities. This measure tries to maximize the accuracy on each of the classes while keeping these accuracies balanced. The :func:`make_index_balanced_accuracy` :cite:`garcia2012effectiveness` can wrap any metric and give more importance to a specific class using the parameter ``alpha``. imbalanced-learn-0.7.0/doc/miscellaneous.rst000066400000000000000000000154001366766276300210640ustar00rootroot00000000000000.. _miscellaneous: ====================== Miscellaneous samplers ====================== .. currentmodule:: imblearn .. _function_sampler: Custom samplers --------------- A fully customized sampler, :class:`FunctionSampler`, is available in imbalanced-learn such that you can quickly prototype your own sampler by defining a single function. Additional parameters can be added using the attribute ``kw_args`` which accepts a dictionary. The following example illustrates how to retain the first 10 elements of the arrays ``X`` and ``y``:: >>> import numpy as np >>> from imblearn import FunctionSampler >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, ... n_redundant=0, n_repeated=0, n_classes=3, ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], ... class_sep=0.8, random_state=0) >>> def func(X, y): ... return X[:10], y[:10] >>> sampler = FunctionSampler(func=func) >>> X_res, y_res = sampler.fit_resample(X, y) >>> np.all(X_res == X[:10]) True >>> np.all(y_res == y[:10]) True In addition, the parameter ``validate`` controls input checking. For instance, setting ``validate=False`` allows passing any type of target ``y`` and performing some sampling on regression targets. >>> from sklearn.datasets import make_regression >>> X_reg, y_reg = make_regression(n_samples=100, random_state=42) >>> rng = np.random.RandomState(42) >>> def dummy_sampler(X, y): ... indices = rng.choice(np.arange(X.shape[0]), size=10) ... return X[indices], y[indices] >>> sampler = FunctionSampler(func=dummy_sampler, validate=False) >>> X_res, y_res = sampler.fit_resample(X_reg, y_reg) >>> y_res array([ 41.49112498, -142.78526195, 85.55095317, 141.43321419, 75.46571114, -67.49177372, 159.72700509, -169.80498923, 211.95889757, 211.95889757]) We illustrate the use of such a sampler to implement an outlier rejection estimator which can be easily used within a :class:`imblearn.pipeline.Pipeline`: :ref:`sphx_glr_auto_examples_plot_outlier_rejections.py`
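As a minimal sketch of such an outlier rejection sampler (the linked example is more complete; :class:`sklearn.ensemble.IsolationForest` is only one possible choice of detector)::

    >>> from sklearn.ensemble import IsolationForest
    >>> def outlier_rejection(X, y):
    ...     # keep only the samples flagged as inliers (+1) by the detector
    ...     preds = IsolationForest(random_state=0).fit_predict(X)
    ...     return X[preds == 1], y[preds == 1]
    >>> outlier_sampler = FunctionSampler(func=outlier_rejection)
    >>> X_inliers, y_inliers = outlier_sampler.fit_resample(X, y)  # doctest: +SKIP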
.. _generators: Custom generators ----------------- Imbalanced-learn provides specific generators for TensorFlow and Keras which will generate balanced mini-batches. .. _tensorflow_generator: TensorFlow generator ~~~~~~~~~~~~~~~~~~~~ The :func:`imblearn.tensorflow.balanced_batch_generator` allows generating balanced mini-batches using an imbalanced-learn sampler which returns indices:: >>> X = X.astype(np.float32) >>> from imblearn.under_sampling import RandomUnderSampler >>> from imblearn.tensorflow import balanced_batch_generator >>> training_generator, steps_per_epoch = balanced_batch_generator( ... X, y, sample_weight=None, sampler=RandomUnderSampler(), ... batch_size=10, random_state=42) The ``generator`` and ``steps_per_epoch`` are used during the training of the TensorFlow model. We will illustrate how to use this generator. First, we can define a logistic regression model which will be optimized by gradient descent:: >>> learning_rate, epochs = 0.01, 10 >>> input_size, output_size = X.shape[1], 3 >>> import tensorflow as tf >>> def init_weights(shape): ... return tf.Variable(tf.random_normal(shape, stddev=0.01)) >>> def accuracy(y_true, y_pred): ... return np.mean(np.argmax(y_pred, axis=1) == y_true) >>> # input and output >>> data = tf.placeholder("float32", shape=[None, input_size]) >>> targets = tf.placeholder("int32", shape=[None]) >>> # build the model and weights >>> W = init_weights([input_size, output_size]) >>> b = init_weights([output_size]) >>> out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) >>> # build the loss, predict, and train operator >>> cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( ... logits=out_act, labels=targets) >>> loss = tf.reduce_sum(cross_entropy) >>> optimizer = tf.train.GradientDescentOptimizer(learning_rate) >>> train_op = optimizer.minimize(loss) >>> predict = tf.nn.softmax(out_act) >>> # Initialization of all variables in the graph >>> init = tf.global_variables_initializer() Once initialized, the model is trained by iterating on balanced mini-batches of data and minimizing the loss previously defined:: >>> with tf.Session() as sess: ... print('Starting training') ... sess.run(init) ... for e in range(epochs): ... for i in range(steps_per_epoch): ... X_batch, y_batch = next(training_generator) ... sess.run([train_op, loss], feed_dict={data: X_batch, targets: y_batch}) ... # For each epoch, run accuracy on train and test ... feed_dict = dict() ... predicts_train = sess.run(predict, feed_dict={data: X}) ... print("epoch: {} train accuracy: {:.3f}" ... .format(e, accuracy(y, predicts_train))) ... # doctest: +ELLIPSIS Starting training [...
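Note that any imbalanced-learn sampler exposing the indices of the selected samples can be passed instead of :class:`~imblearn.under_sampling.RandomUnderSampler`. A sketch, not executed here, with :class:`~imblearn.under_sampling.NearMiss`::

    >>> from imblearn.under_sampling import NearMiss
    >>> training_generator, steps_per_epoch = balanced_batch_generator(
    ...     X, y, sample_weight=None, sampler=NearMiss(version=3),
    ...     batch_size=10, random_state=42)  # doctest: +SKIP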
.. _keras_generator: Keras generator ~~~~~~~~~~~~~~~ Keras provides a higher-level API in which a model can be defined and trained by calling the ``fit_generator`` method. To illustrate, we will define a logistic regression model:: >>> import keras >>> y = keras.utils.to_categorical(y, 3) >>> model = keras.Sequential() >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], ... activation='softmax')) >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', ... metrics=['accuracy']) :func:`imblearn.keras.balanced_batch_generator` creates a generator of balanced mini-batches, together with the number of mini-batches which will be generated:: >>> from imblearn.keras import balanced_batch_generator >>> training_generator, steps_per_epoch = balanced_batch_generator( ... X, y, sampler=RandomUnderSampler(), batch_size=10, random_state=42) Then, ``fit_generator`` can be called, passing the generator and the steps per epoch:: >>> callback_history = model.fit_generator(generator=training_generator, ... steps_per_epoch=steps_per_epoch, ... epochs=10, verbose=0) The second possibility is to use :class:`imblearn.keras.BalancedBatchGenerator`. Only an instance of this class will be passed to ``fit_generator``:: >>> from imblearn.keras import BalancedBatchGenerator >>> training_generator = BalancedBatchGenerator( ... X, y, sampler=RandomUnderSampler(), batch_size=10, random_state=42) >>> callback_history = model.fit_generator(generator=training_generator, ... epochs=10, verbose=0) .. topic:: References * :ref:`sphx_glr_auto_examples_applications_porto_seguro_keras_under_sampling.py`imbalanced-learn-0.7.0/doc/over_sampling.rst000066400000000000000000000276041366766276300210770ustar00rootroot00000000000000.. _over-sampling: ============= Over-sampling ============= .. currentmodule:: imblearn.over_sampling A practical guide ================= You can refer to :ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`. .. _random_over_sampler: Naive random over-sampling -------------------------- One way to fight the imbalance issue is to generate new samples in the classes which are under-represented. The most naive strategy is to generate new samples by randomly sampling, with replacement, from the currently available samples. The :class:`RandomOverSampler` offers such a scheme:: >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, ... n_redundant=0, n_repeated=0, n_classes=3, ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], ... class_sep=0.8, random_state=0) >>> from imblearn.over_sampling import RandomOverSampler >>> ros = RandomOverSampler(random_state=0) >>> X_resampled, y_resampled = ros.fit_resample(X, y) >>> from collections import Counter >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] The augmented data set should be used instead of the original data set to train a classifier:: >>> from sklearn.svm import LinearSVC >>> clf = LinearSVC() >>> clf.fit(X_resampled, y_resampled) # doctest : +ELLIPSIS LinearSVC(...) In the figure below, we compare the decision functions of a classifier trained using the over-sampled data set and the original data set. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_002.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :scale: 60 :align: center As a result, the majority class does not take over the other classes during the training process. Consequently, all classes are represented by the decision function.
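The amount of resampling can be controlled through the ``sampling_strategy`` parameter. As a sketch, a ``dict`` mapping each class to its desired number of samples after resampling (the counts below are arbitrary choices for this toy dataset)::

    >>> ros_dict = RandomOverSampler(
    ...     sampling_strategy={0: 1000, 1: 1000, 2: 4674}, random_state=0)
    >>> X_partial, y_partial = ros_dict.fit_resample(X, y)
    >>> print(sorted(Counter(y_partial).items()))
    [(0, 1000), (1, 1000), (2, 4674)]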
In addition, :class:`RandomOverSampler` allows sampling heterogeneous data (e.g. containing some strings):: >>> import numpy as np >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], ... dtype=np.object) >>> y_hetero = np.array([0, 0, 1]) >>> X_resampled, y_resampled = ros.fit_resample(X_hetero, y_hetero) >>> print(X_resampled) [['xxx' 1 1.0] ['yyy' 2 2.0] ['zzz' 3 3.0] ['zzz' 3 3.0]] >>> print(y_resampled) [0 0 1 1] It also works with a pandas dataframe:: >>> from sklearn.datasets import fetch_openml >>> df_adult, y_adult = fetch_openml( ... 'adult', version=2, as_frame=True, return_X_y=True) >>> df_adult.head() # doctest: +SKIP >>> df_resampled, y_resampled = ros.fit_resample(df_adult, y_adult) >>> df_resampled.head() # doctest: +SKIP .. _smote_adasyn: From random over-sampling to SMOTE and ADASYN --------------------------------------------- Apart from the random sampling with replacement, there are two popular methods to over-sample minority classes: (i) the Synthetic Minority Oversampling Technique (SMOTE) :cite:`chawla2002smote` and (ii) the Adaptive Synthetic (ADASYN) :cite:`he2008adasyn` sampling method. These algorithms can be used in the same manner:: >>> from imblearn.over_sampling import SMOTE, ADASYN >>> X_resampled, y_resampled = SMOTE().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] >>> clf_smote = LinearSVC().fit(X_resampled, y_resampled) >>> X_resampled, y_resampled = ADASYN().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4673), (1, 4662), (2, 4674)] >>> clf_adasyn = LinearSVC().fit(X_resampled, y_resampled) The figure below illustrates the major differences between the different over-sampling methods. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_003.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :scale: 60 :align: center
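The size of the neighbourhood used when interpolating (detailed in the Mathematical formulation section below) is controlled by ``k_neighbors`` in :class:`SMOTE` and ``n_neighbors`` in :class:`ADASYN`. A sketch with a smaller neighbourhood than the default::

    >>> X_resampled, y_resampled = SMOTE(k_neighbors=3).fit_resample(X, y)  # doctest: +SKIP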
Ill-posed examples ------------------ While the :class:`RandomOverSampler` is over-sampling by duplicating some of the original samples of the minority class, :class:`SMOTE` and :class:`ADASYN` generate new samples by interpolation. However, the samples used to interpolate/generate new synthetic samples differ. In fact, :class:`ADASYN` focuses on generating samples next to the original samples which are wrongly classified using a k-Nearest Neighbors classifier, while the basic implementation of :class:`SMOTE` will not make any distinction between easy and hard samples to be classified using the nearest neighbors rule. Therefore, the decision function found during training will differ among the algorithms. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_004.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :align: center The sampling particularities of these two algorithms can lead to some peculiar behavior as shown below. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_005.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :scale: 60 :align: center SMOTE variants -------------- SMOTE might connect inliers and outliers while ADASYN might focus solely on outliers which, in both cases, might lead to a sub-optimal decision function. In this regard, SMOTE offers three additional options to generate samples. Those methods focus on samples near the border of the optimal decision function and will generate samples in the opposite direction of the nearest neighbors class. Those variants are presented in the figure below. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_006.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :scale: 60 :align: center The :class:`BorderlineSMOTE` :cite:`han2005borderline`, :class:`SVMSMOTE` :cite:`nguyen2009borderline`, and :class:`KMeansSMOTE` :cite:`last2017oversampling` offer some variants of the SMOTE algorithm:: >>> from imblearn.over_sampling import BorderlineSMOTE >>> X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] When dealing with mixed data types such as continuous and categorical features, none of the presented methods (apart from the class :class:`RandomOverSampler`) can deal with the categorical features. The :class:`SMOTENC` :cite:`chawla2002smote` is an extension of the :class:`SMOTE` algorithm for which categorical data are treated differently:: >>> # create a synthetic data set with continuous and categorical features >>> rng = np.random.RandomState(42) >>> n_samples = 50 >>> X = np.empty((n_samples, 3), dtype=object) >>> X[:, 0] = rng.choice(['A', 'B', 'C'], size=n_samples).astype(object) >>> X[:, 1] = rng.randn(n_samples) >>> X[:, 2] = rng.randint(3, size=n_samples) >>> y = np.array([0] * 20 + [1] * 30) >>> print(sorted(Counter(y).items())) [(0, 20), (1, 30)] In this data set, the first and last features are considered as categorical features. One needs to provide this information to :class:`SMOTENC` via the parameter ``categorical_features``, either by passing the indices of these features or a boolean mask marking these features:: >>> from imblearn.over_sampling import SMOTENC >>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0) >>> X_resampled, y_resampled = smote_nc.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 30), (1, 30)] >>> print(X_resampled[-5:]) [['A' 0.5246469549655818 2] ['B' -0.3657680728116921 2] ['B' 0.9344237230779993 2] ['B' 0.3710891618824609 2] ['B' 0.3327240726719727 2]] Therefore, it can be seen that the samples generated in the first and last columns belong to the categories originally present, without any extra interpolation. Mathematical formulation ======================== Sample generation ----------------- Both SMOTE and ADASYN use the same algorithm to generate new samples. Considering a sample :math:`x_i`, a new sample :math:`x_{new}` will be generated considering its k nearest neighbors (corresponding to ``k_neighbors``). For instance, the 3 nearest neighbors are included in the blue circle as illustrated in the figure below. Then, one of these nearest neighbors :math:`x_{zi}` is selected and a sample is generated as follows: .. math:: x_{new} = x_i + \lambda \times (x_{zi} - x_i) where :math:`\lambda` is a random number in the range :math:`[0, 1]`. This interpolation will create a sample on the line between :math:`x_{i}` and :math:`x_{zi}` as illustrated in the image below: .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_illustration_generation_sample_001.png :target: ./auto_examples/over-sampling/plot_illustration_generation_sample.html :scale: 60 :align: center
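To make the generation rule concrete, the interpolation can be computed by hand on a toy pair of samples (the values are arbitrary and :math:`\lambda` is fixed here instead of being drawn at random)::

    >>> import numpy as np
    >>> x_i = np.array([1.0, 2.0])   # a minority sample
    >>> x_zi = np.array([3.0, 4.0])  # one of its nearest neighbors
    >>> lam = 0.5                    # in practice, drawn uniformly in [0, 1]
    >>> x_i + lam * (x_zi - x_i)     # the new sample lies on the segment
    array([2., 3.])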
SMOTE-NC slightly changes the way a new sample is generated by performing something specific for the categorical features. In fact, the categories of a newly generated sample are decided by picking the most frequent category of the nearest neighbors present during the generation. .. warning:: Be aware that SMOTE-NC is not designed to work with only categorical data. The other SMOTE variants and ADASYN differ from each other by the way they select the samples :math:`x_i` ahead of generating the new samples. The **regular** SMOTE algorithm --- cf. to the :class:`SMOTE` object --- does not impose any rule and will randomly pick up all possible :math:`x_i` available. The **borderline** SMOTE --- cf. to the :class:`BorderlineSMOTE` with the parameters ``kind='borderline-1'`` and ``kind='borderline-2'`` --- will classify each sample :math:`x_i` to be (i) noise (i.e. all nearest neighbors are from a different class than the one of :math:`x_i`), (ii) in danger (i.e. at least half of the nearest neighbors are from the same class as :math:`x_i`), or (iii) safe (i.e. all nearest neighbors are from the same class as :math:`x_i`). **Borderline-1** and **Borderline-2** SMOTE will use the samples *in danger* to generate new samples. In **Borderline-1** SMOTE, :math:`x_{zi}` will belong to the same class as the sample :math:`x_i`. On the contrary, **Borderline-2** SMOTE will consider :math:`x_{zi}` which can be from any class. **SVM** SMOTE --- cf. to :class:`SVMSMOTE` --- uses an SVM classifier to find support vectors and generate samples considering them. Note that the ``C`` parameter of the SVM classifier allows selecting more or less support vectors. For both borderline and SVM SMOTE, a neighborhood is defined using the parameter ``m_neighbors`` to decide if a sample is in danger, safe, or noise. **KMeans** SMOTE --- cf. to :class:`KMeansSMOTE` --- uses a KMeans clustering method before applying SMOTE. The clustering will group samples together and generate new samples depending on the cluster density. ADASYN works similarly to the regular SMOTE. However, the number of samples generated for each :math:`x_i` is proportional to the number of samples which are not from the same class as :math:`x_i` in a given neighborhood. Therefore, more samples will be generated in the areas where the nearest neighbor rule is not respected. The parameter ``n_neighbors`` is equivalent to ``k_neighbors`` in :class:`SMOTE`. Multi-class management ---------------------- All algorithms can be used with multiple classes as well as binary classification. :class:`RandomOverSampler` does not require any inter-class information during the sample generation. Therefore, each targeted class is resampled independently. On the contrary, both :class:`ADASYN` and :class:`SMOTE` need information regarding the neighbourhood of each sample used for sample generation. They use a one-vs-rest approach by selecting each targeted class and computing the necessary statistics against the rest of the data set, which is grouped in a single class. imbalanced-learn-0.7.0/doc/references.rst000066400000000000000000000001031366766276300203360ustar00rootroot00000000000000========== References ========== ..
bibliography:: bibtex/refs.bibimbalanced-learn-0.7.0/doc/sphinxext/000077500000000000000000000000001366766276300175215ustar00rootroot00000000000000imbalanced-learn-0.7.0/doc/sphinxext/LICENSE.txt000066400000000000000000000136231366766276300213510ustar00rootroot00000000000000------------------------------------------------------------------------------- The files - numpydoc.py - autosummary.py - autosummary_generate.py - docscrape.py - docscrape_sphinx.py - phantom_import.py have the following license: Copyright (C) 2008 Stefan van der Walt , Pauli Virtanen Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------- The files - compiler_unparse.py - comment_eater.py - traitsdoc.py have the following license: This software is OSI Certified Open Source Software. OSI Certified is a certification mark of the Open Source Initiative. Copyright (c) 2006, Enthought, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Enthought, Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
------------------------------------------------------------------------------- The files - only_directives.py - plot_directive.py originate from Matplotlib (http://matplotlib.sf.net/) which has the following license: Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved. 1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation. 2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee. 3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3. 4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. 5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. 7. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. 8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement. imbalanced-learn-0.7.0/doc/sphinxext/MANIFEST.in000066400000000000000000000000531366766276300212550ustar00rootroot00000000000000recursive-include tests *.py include *.txt imbalanced-learn-0.7.0/doc/sphinxext/README.txt000066400000000000000000000032401366766276300212160ustar00rootroot00000000000000===================================== numpydoc -- Numpy's Sphinx extensions ===================================== Numpy's documentation uses several custom extensions to Sphinx. These are shipped in this ``numpydoc`` package, in case you want to make use of them in third-party projects. The following extensions are available: - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add the code description directives ``np-function``, ``np-cfunction``, etc. that support the Numpy docstring syntax. - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes. 
- ``numpydoc.plot_directives``: Adaptation of Matplotlib's ``plot::`` directive. Note that this implementation may still undergo severe changes or eventually be deprecated. - ``numpydoc.only_directives``: (DEPRECATED) - ``numpydoc.autosummary``: (DEPRECATED) An ``autosummary::`` directive. Available in Sphinx 0.6.2 and (to-be) 1.0 as ``sphinx.ext.autosummary``, and it the Sphinx 1.0 version is recommended over that included in Numpydoc. numpydoc ======== Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings following the Numpy/Scipy format to a form palatable to Sphinx. Options ------- The following options can be set in conf.py: - numpydoc_use_plots: bool Whether to produce ``plot::`` directives for Examples sections that contain ``import matplotlib``. - numpydoc_show_class_members: bool Whether to show all members of a class in the Methods and Attributes sections automatically. - numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead) Whether to insert an edit link after docstrings. imbalanced-learn-0.7.0/doc/sphinxext/github_link.py000066400000000000000000000051601366766276300223740ustar00rootroot00000000000000from operator import attrgetter import inspect import subprocess import os import sys from functools import partial REVISION_CMD = 'git rev-parse --short HEAD' def _get_git_revision(): try: revision = subprocess.check_output(REVISION_CMD.split()).strip() except (subprocess.CalledProcessError, OSError): print('Failed to execute git to get revision') return None return revision.decode('utf-8') def _linkcode_resolve(domain, info, package, url_fmt, revision): """Determine a link to online source for a class/method/function This is called by sphinx.ext.linkcode An example with a long-untouched module that everyone has >>> _linkcode_resolve('py', {'module': 'tty', ... 'fullname': 'setraw'}, ... package='tty', ... url_fmt='http://hg.python.org/cpython/file/' ... '{revision}/Lib/{package}/{path}#L{lineno}', ... revision='xxxx') 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' """ if revision is None: return if domain not in ('py', 'pyx'): return if not info.get('module') or not info.get('fullname'): return class_name = info['fullname'].split('.')[0] if type(class_name) != str: # Python 2 only class_name = class_name.encode('utf-8') module = __import__(info['module'], fromlist=[class_name]) obj = attrgetter(info['fullname'])(module) try: fn = inspect.getsourcefile(obj) except Exception: fn = None if not fn: try: fn = inspect.getsourcefile(sys.modules[obj.__module__]) except Exception: fn = None if not fn: return fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) try: lineno = inspect.getsourcelines(obj)[1] except Exception: lineno = '' return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) def make_linkcode_resolve(package, url_fmt): """Returns a linkcode_resolve function for the given URL format revision is a git commit reference (hash or name) package is the name of the root module of the package url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 'blob/{revision}/{package}/' '{path}#L{lineno}') """ revision = _get_git_revision() return partial(_linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt) imbalanced-learn-0.7.0/doc/sphinxext/sphinx_issues.py000066400000000000000000000176661366766276300230170ustar00rootroot00000000000000# -*- coding: utf-8 -*- """A Sphinx extension for linking to your project's issue tracker. 
Copyright 2014 Steven Loria Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import re from docutils import nodes, utils from sphinx.util.nodes import split_explicit_title __version__ = "1.2.0" __author__ = "Steven Loria" __license__ = "MIT" def user_role(name, rawtext, text, lineno, inliner, options=None, content=None): """Sphinx role for linking to a user profile. Defaults to linking to Github profiles, but the profile URIS can be configured via the ``issues_user_uri`` config value. Examples: :: :user:`sloria` Anchor text also works: :: :user:`Steven Loria ` """ options = options or {} content = content or [] has_explicit_title, title, target = split_explicit_title(text) target = utils.unescape(target).strip() title = utils.unescape(title).strip() config = inliner.document.settings.env.app.config if config.issues_user_uri: ref = config.issues_user_uri.format(user=target) else: ref = "https://github.com/{0}".format(target) if has_explicit_title: text = title else: text = "@{0}".format(target) link = nodes.reference(text=text, refuri=ref, **options) return [link], [] def cve_role(name, rawtext, text, lineno, inliner, options=None, content=None): """Sphinx role for linking to a CVE on https://cve.mitre.org. 
Examples: :: :cve:`CVE-2018-17175` """ options = options or {} content = content or [] has_explicit_title, title, target = split_explicit_title(text) target = utils.unescape(target).strip() title = utils.unescape(title).strip() ref = "https://cve.mitre.org/cgi-bin/cvename.cgi?name={0}".format(target) text = title if has_explicit_title else target link = nodes.reference(text=text, refuri=ref, **options) return [link], [] class IssueRole(object): EXTERNAL_REPO_REGEX = re.compile(r"^(\w+)/(.+)([#@])([\w]+)$") def __init__( self, uri_config_option, format_kwarg, github_uri_template, format_text=None ): self.uri_config_option = uri_config_option self.format_kwarg = format_kwarg self.github_uri_template = github_uri_template self.format_text = format_text or self.default_format_text @staticmethod def default_format_text(issue_no): return "#{0}".format(issue_no) def make_node(self, name, issue_no, config, options=None): name_map = {"pr": "pull", "issue": "issues", "commit": "commit"} options = options or {} repo_match = self.EXTERNAL_REPO_REGEX.match(issue_no) if repo_match: # External repo username, repo, symbol, issue = repo_match.groups() if name not in name_map: raise ValueError( "External repo linking not supported for :{}:".format(name) ) path = name_map.get(name) ref = "https://github.com/{issues_github_path}/{path}/{n}".format( issues_github_path="{}/{}".format(username, repo), path=path, n=issue ) formatted_issue = self.format_text(issue).lstrip("#") text = "{username}/{repo}{symbol}{formatted_issue}".format(**locals()) link = nodes.reference(text=text, refuri=ref, **options) return link if issue_no not in ("-", "0"): uri_template = getattr(config, self.uri_config_option, None) if uri_template: ref = uri_template.format(**{self.format_kwarg: issue_no}) elif config.issues_github_path: ref = self.github_uri_template.format( issues_github_path=config.issues_github_path, n=issue_no ) else: raise ValueError( "Neither {} nor issues_github_path " "is set".format(self.uri_config_option) ) issue_text = self.format_text(issue_no) link = nodes.reference(text=issue_text, refuri=ref, **options) else: link = None return link def __call__( self, name, rawtext, text, lineno, inliner, options=None, content=None ): options = options or {} content = content or [] issue_nos = [each.strip() for each in utils.unescape(text).split(",")] config = inliner.document.settings.env.app.config ret = [] for i, issue_no in enumerate(issue_nos): node = self.make_node(name, issue_no, config, options=options) ret.append(node) if i != len(issue_nos) - 1: sep = nodes.raw(text=", ", format="html") ret.append(sep) return ret, [] """Sphinx role for linking to an issue. Must have `issues_uri` or `issues_github_path` configured in ``conf.py``. Examples: :: :issue:`123` :issue:`42,45` :issue:`sloria/konch#123` """ issue_role = IssueRole( uri_config_option="issues_uri", format_kwarg="issue", github_uri_template="https://github.com/{issues_github_path}/issues/{n}", ) """Sphinx role for linking to a pull request. Must have `issues_pr_uri` or `issues_github_path` configured in ``conf.py``. Examples: :: :pr:`123` :pr:`42,45` :pr:`sloria/konch#43` """ pr_role = IssueRole( uri_config_option="issues_pr_uri", format_kwarg="pr", github_uri_template="https://github.com/{issues_github_path}/pull/{n}", ) def format_commit_text(sha): return sha[:7] """Sphinx role for linking to a commit. Must have `issues_pr_uri` or `issues_github_path` configured in ``conf.py``. 
Examples: :: :commit:`123abc456def` :commit:`sloria/konch@123abc456def` """ commit_role = IssueRole( uri_config_option="issues_commit_uri", format_kwarg="commit", github_uri_template="https://github.com/{issues_github_path}/commit/{n}", format_text=format_commit_text, ) def setup(app): # Format template for issues URI # e.g. 'https://github.com/sloria/marshmallow/issues/{issue} app.add_config_value("issues_uri", default=None, rebuild="html") # Format template for PR URI # e.g. 'https://github.com/sloria/marshmallow/pull/{issue} app.add_config_value("issues_pr_uri", default=None, rebuild="html") # Format template for commit URI # e.g. 'https://github.com/sloria/marshmallow/commits/{commit} app.add_config_value("issues_commit_uri", default=None, rebuild="html") # Shortcut for Github, e.g. 'sloria/marshmallow' app.add_config_value("issues_github_path", default=None, rebuild="html") # Format template for user profile URI # e.g. 'https://github.com/{user}' app.add_config_value("issues_user_uri", default=None, rebuild="html") app.add_role("issue", issue_role) app.add_role("pr", pr_role) app.add_role("user", user_role) app.add_role("commit", commit_role) app.add_role("cve", cve_role) return { "version": __version__, "parallel_read_safe": True, "parallel_write_safe": True, } imbalanced-learn-0.7.0/doc/under_sampling.rst000066400000000000000000000404611366766276300212350ustar00rootroot00000000000000.. _under-sampling: ============== Under-sampling ============== .. currentmodule:: imblearn.under_sampling You can refer to :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`. .. _cluster_centroids: Prototype generation ==================== Given an original data set :math:`S`, prototype generation algorithms will generate a new set :math:`S'` where :math:`|S'| < |S|` and :math:`S' \not\subset S`. In other words, prototype generation technique will reduce the number of samples in the targeted classes but the remaining samples are generated --- and not selected --- from the original set. :class:`ClusterCentroids` makes use of K-means to reduce the number of samples. Therefore, each class will be synthesized with the centroids of the K-means method instead of the original samples:: >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, ... n_redundant=0, n_repeated=0, n_classes=3, ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], ... class_sep=0.8, random_state=0) >>> print(sorted(Counter(y).items())) [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.under_sampling import ClusterCentroids >>> cc = ClusterCentroids(random_state=0) >>> X_resampled, y_resampled = cc.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] The figure below illustrates such under-sampling. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_comparison_under_sampling_001.png :target: ./auto_examples/under-sampling/plot_comparison_under_sampling.html :scale: 60 :align: center :class:`ClusterCentroids` offers an efficient way to represent the data cluster with a reduced number of samples. Keep in mind that this method requires that your data are grouped into clusters. In addition, the number of centroids should be set such that the under-sampled clusters are representative of the original one. .. warning:: :class:`ClusterCentroids` supports sparse matrices. However, the new samples generated are not specifically sparse. 
Therefore, even if the resulting matrix is sparse, the algorithm will be inefficient in this regard. Prototype selection =================== On the contrary to prototype generation algorithms, prototype selection algorithms will select samples from the original set :math:`S`. Therefore, :math:`S'` is defined such that :math:`|S'| < |S|` and :math:`S' \subset S`. In addition, these algorithms can be divided into two groups: (i) the controlled under-sampling techniques and (ii) the cleaning under-sampling techniques. The first group of methods allows for an under-sampling strategy in which the number of samples in :math:`S'` is specified by the user. By contrast, cleaning under-sampling techniques do not allow this specification and are meant for cleaning the feature space. .. _controlled_under_sampling: Controlled under-sampling techniques ------------------------------------ :class:`RandomUnderSampler` is a fast and easy way to balance the data by randomly selecting a subset of data for the targeted classes:: >>> from imblearn.under_sampling import RandomUnderSampler >>> rus = RandomUnderSampler(random_state=0) >>> X_resampled, y_resampled = rus.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_comparison_under_sampling_002.png :target: ./auto_examples/under-sampling/plot_comparison_under_sampling.html :scale: 60 :align: center :class:`RandomUnderSampler` allows bootstrapping the data by setting ``replacement`` to ``True``. The resampling with multiple classes is performed by considering each targeted class independently:: >>> import numpy as np >>> print(np.vstack([tuple(row) for row in X_resampled]).shape) (192, 2) >>> rus = RandomUnderSampler(random_state=0, replacement=True) >>> X_resampled, y_resampled = rus.fit_resample(X, y) >>> print(np.vstack(np.unique([tuple(row) for row in X_resampled], axis=0)).shape) (181, 2) In addition, :class:`RandomUnderSampler` allows sampling heterogeneous data (e.g. containing some strings):: >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], ... dtype=np.object) >>> y_hetero = np.array([0, 0, 1]) >>> X_resampled, y_resampled = rus.fit_resample(X_hetero, y_hetero) >>> print(X_resampled) [['xxx' 1 1.0] ['zzz' 3 3.0]] >>> print(y_resampled) [0 1] It also works with a pandas dataframe:: >>> from sklearn.datasets import fetch_openml >>> df_adult, y_adult = fetch_openml( ... 'adult', version=2, as_frame=True, return_X_y=True) >>> df_adult.head() # doctest: +SKIP >>> df_resampled, y_resampled = rus.fit_resample(df_adult, y_adult) >>> df_resampled.head() # doctest: +SKIP :class:`NearMiss` adds some heuristic rules to select samples :cite:`mani2003knn`. :class:`NearMiss` implements 3 different types of heuristics which can be selected with the parameter ``version``:: >>> from imblearn.under_sampling import NearMiss >>> nm1 = NearMiss(version=1) >>> X_resampled_nm1, y_resampled = nm1.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] As stated in the next section, the :class:`NearMiss` heuristic rules are based on a nearest neighbors algorithm. Therefore, the parameters ``n_neighbors`` and ``n_neighbors_ver3`` accept a classifier derived from ``KNeighborsMixin`` from scikit-learn. The former parameter is used to compute the average distance to the neighbors while the latter is used for the pre-selection of the samples of interest.
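For instance, a :class:`sklearn.neighbors.NearestNeighbors` instance (which derives from ``KNeighborsMixin``) can be passed explicitly. A minimal sketch, reusing the same ``X`` and ``y``::

    >>> from sklearn.neighbors import NearestNeighbors
    >>> nm3 = NearMiss(version=3, n_neighbors=NearestNeighbors(n_neighbors=3))
    >>> X_resampled, y_resampled = nm3.fit_resample(X, y)  # doctest: +SKIP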
Mathematical formulation ^^^^^^^^^^^^^^^^^^^^^^^^ Let *positive samples* be the samples belonging to the targeted class to be under-sampled. *Negative sample* refers to the samples from the minority class (i.e., the most under-represented class). NearMiss-1 selects the positive samples for which the average distance to the :math:`N` closest samples of the negative class is the smallest. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_illustration_nearmiss_001.png :target: ./auto_examples/under-sampling/plot_illustration_nearmiss.html :scale: 60 :align: center NearMiss-2 selects the positive samples for which the average distance to the :math:`N` farthest samples of the negative class is the smallest. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_illustration_nearmiss_002.png :target: ./auto_examples/under-sampling/plot_illustration_nearmiss.html :scale: 60 :align: center NearMiss-3 is a 2-step algorithm. First, for each negative sample, its :math:`M` nearest neighbors will be kept. Then, the positive samples selected are the ones for which the average distance to the :math:`N` nearest neighbors is the largest. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_illustration_nearmiss_003.png :target: ./auto_examples/under-sampling/plot_illustration_nearmiss.html :scale: 60 :align: center In the next example, the different :class:`NearMiss` variants are applied on the previous toy example. It can be seen that the decision functions obtained in each case are different. When under-sampling a specific class, NearMiss-1 can be altered by the presence of noise. In fact, noise will imply that samples of the targeted class are selected around the noisy samples, as is the case in the illustration below for the yellow class. However, in the normal case, samples next to the boundaries will be selected. NearMiss-2 will not have this effect since it does not focus on the nearest samples but rather on the farthest samples. We can imagine that the presence of noise can also alter the sampling, mainly in the presence of marginal outliers. NearMiss-3 is probably the version which will be the least affected by noise due to the first-step sample selection. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_comparison_under_sampling_003.png :target: ./auto_examples/under-sampling/plot_comparison_under_sampling.html :scale: 60 :align: center Cleaning under-sampling techniques ---------------------------------- Cleaning under-sampling techniques do not allow specifying the number of samples to have in each class. In fact, each algorithm implements a heuristic which will clean the dataset. .. _tomek_links: Tomek's links ^^^^^^^^^^^^^ :class:`TomekLinks` detects the so-called Tomek's links :cite:`tomek1976two`. A Tomek's link between two samples of different classes :math:`x` and :math:`y` is defined such that for any sample :math:`z`: .. math:: d(x, y) < d(x, z) \text{ and } d(x, y) < d(y, z) where :math:`d(.)` is the distance between the two samples. In other words, a Tomek's link exists if the two samples are the nearest neighbors of each other. In the figure below, a Tomek's link is illustrated by highlighting the samples of interest in green. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_illustration_tomek_links_001.png :target: ./auto_examples/under-sampling/plot_illustration_tomek_links.html :scale: 60 :align: center
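Removing the links themselves is a one-liner. A sketch on the dataset used throughout this section (the resulting counts are not asserted since they depend on the data at hand)::

    >>> from imblearn.under_sampling import TomekLinks
    >>> tl = TomekLinks()
    >>> X_resampled, y_resampled = tl.fit_resample(X, y)  # doctest: +SKIP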
The parameter ``sampling_strategy`` controls which sample of the link will be removed. For instance, the default (i.e., ``sampling_strategy='auto'``) will remove the sample from the majority class. Both samples, from the majority and minority classes, can be removed by setting ``sampling_strategy`` to ``'all'``. The figure illustrates this behaviour. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_illustration_tomek_links_002.png :target: ./auto_examples/under-sampling/plot_illustration_tomek_links.html :scale: 60 :align: center .. _edited_nearest_neighbors: Edited data set using nearest neighbours ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :class:`EditedNearestNeighbours` applies a nearest-neighbors algorithm and "edits" the dataset by removing samples which do not agree "enough" with their neighborhood :cite:`wilson1972asymptotic`. For each sample in the class to be under-sampled, the nearest neighbors are computed and, if the selection criterion is not fulfilled, the sample is removed. Two selection criteria are currently available: (i) the majority (i.e., ``kind_sel='mode'``) or (ii) all (i.e., ``kind_sel='all'``) of the nearest neighbors have to belong to the same class as the sample inspected to keep it in the dataset:: >>> sorted(Counter(y).items()) [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.under_sampling import EditedNearestNeighbours >>> enn = EditedNearestNeighbours() >>> X_resampled, y_resampled = enn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 213), (2, 4568)] The parameter ``n_neighbors`` allows providing a classifier subclassed from ``KNeighborsMixin`` from scikit-learn to find the nearest neighbors and make the decision to keep a given sample or not.
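As a sketch, the majority-vote criterion can be selected instead of the default ``kind_sel='all'``; being less strict, it typically removes fewer samples::

    >>> enn_mode = EditedNearestNeighbours(kind_sel='mode')
    >>> X_resampled, y_resampled = enn_mode.fit_resample(X, y)  # doctest: +SKIP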
4. If the sample is misclassified, add it to :math:`C`, otherwise do nothing.
5. Reiterate on :math:`S` until there are no samples to be added.

The :class:`CondensedNearestNeighbour` can be used in the following manner::

   >>> from imblearn.under_sampling import CondensedNearestNeighbour
   >>> cnn = CondensedNearestNeighbour(random_state=0)
   >>> X_resampled, y_resampled = cnn.fit_resample(X, y)
   >>> print(sorted(Counter(y_resampled).items()))
   [(0, 64), (1, 24), (2, 115)]

However, as illustrated in the figure below,
:class:`CondensedNearestNeighbour` is sensitive to noise and will add noisy
samples.

On the contrary, :class:`OneSidedSelection` will use :class:`TomekLinks` to
remove noisy samples :cite:`hart1968condensed`. In addition, the 1 nearest
neighbor rule is applied to all samples, and the ones which are misclassified
will be added to the set :math:`C`. No iteration on the set :math:`S` will
take place. The class can be used as::

   >>> from imblearn.under_sampling import OneSidedSelection
   >>> oss = OneSidedSelection(random_state=0)
   >>> X_resampled, y_resampled = oss.fit_resample(X, y)
   >>> print(sorted(Counter(y_resampled).items()))
   [(0, 64), (1, 174), (2, 4404)]

Our implementation allows setting the number of seeds initially put in the set
:math:`C` through the parameter ``n_seeds_S``.

:class:`NeighbourhoodCleaningRule` will focus on cleaning the data rather than
condensing them :cite:`laurikkala2001improving`. Therefore, it will use the
union of the samples to be rejected by :class:`EditedNearestNeighbours` and by
the output of a 3 nearest neighbors classifier. The class can be used as::

   >>> from imblearn.under_sampling import NeighbourhoodCleaningRule
   >>> ncr = NeighbourhoodCleaningRule()
   >>> X_resampled, y_resampled = ncr.fit_resample(X, y)
   >>> print(sorted(Counter(y_resampled).items()))
   [(0, 64), (1, 234), (2, 4666)]

.. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_comparison_under_sampling_005.png
   :target: ./auto_examples/under-sampling/plot_comparison_under_sampling.html
   :scale: 60
   :align: center

.. _instance_hardness_threshold:

Instance hardness threshold
^^^^^^^^^^^^^^^^^^^^^^^^^^^

:class:`InstanceHardnessThreshold` is a specific algorithm in which a
classifier is trained on the data and the samples with the lower probabilities
are removed :cite:`smith2014instance`. The class can be used as::

   >>> from sklearn.linear_model import LogisticRegression
   >>> from imblearn.under_sampling import InstanceHardnessThreshold
   >>> iht = InstanceHardnessThreshold(random_state=0,
   ...                                 estimator=LogisticRegression(
   ...                                     solver='lbfgs', multi_class='auto'))
   >>> X_resampled, y_resampled = iht.fit_resample(X, y)
   >>> print(sorted(Counter(y_resampled).items()))
   [(0, 64), (1, 64), (2, 64)]

This class has 2 important parameters. ``estimator`` will accept any
scikit-learn classifier which has a method ``predict_proba``. The classifier
training is performed using cross-validation, and the parameter ``cv`` can set
the number of folds to use.

.. note::

   :class:`InstanceHardnessThreshold` could almost be considered as a
   controlled under-sampling method. However, due to the probability outputs,
   it is not always possible to get a specific number of samples.
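As a sketch of this parametrization (``cv=5`` is an arbitrary value chosen for
illustration; no output is shown since the resampled counts depend on the
data)::

   >>> iht = InstanceHardnessThreshold(
   ...     random_state=0,
   ...     estimator=LogisticRegression(solver='lbfgs', multi_class='auto'),
   ...     cv=5)
   >>> X_resampled, y_resampled = iht.fit_resample(X, y)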
The figure below gives another example on some toy data.

.. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_comparison_under_sampling_006.png
   :target: ./auto_examples/under-sampling/plot_comparison_under_sampling.html
   :scale: 60
   :align: center

imbalanced-learn-0.7.0/doc/user_guide.rst

.. title:: User guide: contents

.. _user_guide:

==========
User Guide
==========

.. toctree::
   :numbered:

   introduction.rst
   over_sampling.rst
   under_sampling.rst
   combine.rst
   ensemble.rst
   miscellaneous.rst
   metrics.rst
   Dataset loading utilities
   developers_utils.rst
   references.rst

imbalanced-learn-0.7.0/doc/whats_new.rst

.. currentmodule:: imblearn

===============
Release history
===============

.. include:: whats_new/v0.7.rst
.. include:: whats_new/v0.6.rst
.. include:: whats_new/v0.5.rst
.. include:: whats_new/v0.4.rst
.. include:: whats_new/v0.3.rst
.. include:: whats_new/v0.2.rst
.. include:: whats_new/v0.1.rst

imbalanced-learn-0.7.0/doc/whats_new/v0.1.rst

.. _changes_0_1:

Version 0.1
===========

Changelog
---------

API
~~~

- First release of the stable API. By :user:`Fernando Nogueira `,
  :user:`Guillaume Lemaitre `, :user:`Christos Aridas `, and :user:`Dayvid
  Oliveira `.

New methods
~~~~~~~~~~~

* Under-sampling
    1. Random majority under-sampling with replacement
    2. Extraction of majority-minority Tomek links
    3. Under-sampling with Cluster Centroids
    4. NearMiss-(1 & 2 & 3)
    5. Condensed Nearest Neighbour
    6. One-Sided Selection
    7. Neighbourhood Cleaning Rule
    8. Edited Nearest Neighbours
    9. Instance Hardness Threshold
    10. Repeated Edited Nearest Neighbours

* Over-sampling
    1. Random minority over-sampling with replacement
    2. SMOTE - Synthetic Minority Over-sampling Technique
    3. bSMOTE(1 & 2) - Borderline SMOTE of types 1 and 2
    4. SVM SMOTE - Support Vectors SMOTE
    5. ADASYN - Adaptive synthetic sampling approach for imbalanced learning

* Over-sampling followed by under-sampling
    1. SMOTE + Tomek links
    2. SMOTE + ENN

* Ensemble sampling
    1. EasyEnsemble
    2. BalanceCascade

imbalanced-learn-0.7.0/doc/whats_new/v0.2.rst

.. _changes_0_2:

Version 0.2
===========

Changelog
---------

Bug fixes
~~~~~~~~~

- Fixed a bug in :class:`under_sampling.NearMiss` which was not picking the
  right samples during under-sampling for method 3. By :user:`Guillaume
  Lemaitre `.
- Fixed a bug in :class:`ensemble.EasyEnsemble`, correction of the
  `random_state` generation. By :user:`Guillaume Lemaitre ` and
  :user:`Christos Aridas `.
- Fixed a bug in :class:`under_sampling.RepeatedEditedNearestNeighbours`,
  adding an additional stopping criterion to avoid that the minority class
  becomes a majority class or that a class disappears. By :user:`Guillaume
  Lemaitre `.
- Fixed a bug in :class:`under_sampling.AllKNN`, adding a stopping criterion
  to avoid that the minority class becomes a majority class or that a class
  disappears. By :user:`Guillaume Lemaitre `.
- Fixed a bug in :class:`under_sampling.CondensedNearestNeighbour`,
  correction of the list of indices returned. By :user:`Guillaume Lemaitre `.
- Fixed a bug in :class:`ensemble.BalanceCascade`, solving the issue of
  obtaining a single array if desired. By :user:`Guillaume Lemaitre `.
- Fixed a bug in :class:`pipeline.Pipeline`, allowing a `Pipeline` to be
  embedded in another `Pipeline`. :issue:`231` by :user:`Christos Aridas `.
- Fixed a bug in :class:`pipeline.Pipeline`, solving the issue of putting two
  samplers in the same `Pipeline`. :issue:`188` by :user:`Christos Aridas `.
- Fixed a bug in :class:`under_sampling.CondensedNearestNeighbour`,
  correction of the shape of `sel_x` when only one sample is selected. By
  :user:`Aliaksei Halachkin `.
- Fixed a bug in :class:`under_sampling.NeighbourhoodCleaningRule`, selecting
  neighbours instead of minority class misclassified samples. :issue:`230` by
  :user:`Aleksandr Loskutov `.
- Fixed a bug in :class:`over_sampling.ADASYN`, correction of the creation of
  a new sample so that the new sample lies between the minority sample and
  the nearest neighbour. :issue:`235` by :user:`Rafael Wampfler `.

New features
~~~~~~~~~~~~

- Added the AllKNN under-sampling technique. By :user:`Dayvid Oliveira `.
- Added a module `metrics` implementing some specific scoring functions for
  the problem of balancing. :issue:`204` by :user:`Guillaume Lemaitre ` and
  :user:`Christos Aridas `.

Enhancement
~~~~~~~~~~~

- Added support for bumpversion. By :user:`Guillaume Lemaitre `.
- Validate the type of target in binary samplers. A warning is raised for the
  moment. By :user:`Guillaume Lemaitre ` and :user:`Christos Aridas `.
- Change from the `cross_validation` module to the `model_selection` module
  for the `sklearn` deprecation cycle. By :user:`Dayvid Oliveira ` and
  :user:`Christos Aridas `.

API changes summary
~~~~~~~~~~~~~~~~~~~

- `size_ngh` has been deprecated in :class:`combine.SMOTEENN`. Use
  `n_neighbors` instead. By :user:`Guillaume Lemaitre `, :user:`Christos
  Aridas `, and :user:`Dayvid Oliveira `.
- `size_ngh` has been deprecated in
  :class:`under_sampling.EditedNearestNeighbours`. Use `n_neighbors` instead.
  By :user:`Guillaume Lemaitre `, :user:`Christos Aridas `, and :user:`Dayvid
  Oliveira `.
- `size_ngh` has been deprecated in
  :class:`under_sampling.CondensedNearestNeighbour`. Use `n_neighbors`
  instead. By :user:`Guillaume Lemaitre `, :user:`Christos Aridas `, and
  :user:`Dayvid Oliveira `.
- `size_ngh` has been deprecated in
  :class:`under_sampling.OneSidedSelection`. Use `n_neighbors` instead. By
  :user:`Guillaume Lemaitre `, :user:`Christos Aridas `, and :user:`Dayvid
  Oliveira `.
- `size_ngh` has been deprecated in
  :class:`under_sampling.NeighbourhoodCleaningRule`. Use `n_neighbors`
  instead. By :user:`Guillaume Lemaitre `, :user:`Christos Aridas `, and
  :user:`Dayvid Oliveira `.
- `size_ngh` has been deprecated in
  :class:`under_sampling.RepeatedEditedNearestNeighbours`. Use `n_neighbors`
  instead. By :user:`Guillaume Lemaitre `, :user:`Christos Aridas `, and
  :user:`Dayvid Oliveira `.
- `size_ngh` has been deprecated in :class:`under_sampling.AllKNN`. Use
  `n_neighbors` instead. By :user:`Guillaume Lemaitre `, :user:`Christos
  Aridas `, and :user:`Dayvid Oliveira `.
- Two base classes :class:`BaseBinaryclassSampler` and
  :class:`BaseMulticlassSampler` have been created to handle the target type
  and raise a warning in case of abnormality. By :user:`Guillaume Lemaitre `
  and :user:`Christos Aridas `.
- Move `random_state` to be assigned in the :class:`SamplerMixin`
  initialization. By :user:`Guillaume Lemaitre `.
- Provide estimators instead of parameters in :class:`combine.SMOTEENN` and
  :class:`combine.SMOTETomek`. Therefore, the list of parameters has been
  deprecated. By :user:`Guillaume Lemaitre ` and :user:`Christos Aridas `.
- `k` has been deprecated in :class:`over_sampling.ADASYN`. Use `n_neighbors`
  instead. :issue:`183` by :user:`Guillaume Lemaitre `.
- `k` and `m` have been deprecated in :class:`over_sampling.SMOTE`. Use
  `k_neighbors` and `m_neighbors` instead. :issue:`182` by :user:`Guillaume
  Lemaitre `.
- `n_neighbors` accepts `KNeighborsMixin`-based objects for
  :class:`under_sampling.EditedNearestNeighbours`,
  :class:`under_sampling.CondensedNearestNeighbour`,
  :class:`under_sampling.NeighbourhoodCleaningRule`,
  :class:`under_sampling.RepeatedEditedNearestNeighbours`, and
  :class:`under_sampling.AllKNN`. :issue:`109` by :user:`Guillaume Lemaitre `.

Documentation changes
~~~~~~~~~~~~~~~~~~~~~

- Replace some remaining `UnbalancedDataset` occurrences. By :user:`Francois
  Magimel `.
- Added doctests in the documentation. By :user:`Guillaume Lemaitre `.

imbalanced-learn-0.7.0/doc/whats_new/v0.3.rst

.. _changes_0_3:

Version 0.3
===========

Changelog
---------

Testing
~~~~~~~

- Pytest is used instead of nosetests. :issue:`321` by :user:`Joan Massich `.

Documentation
~~~~~~~~~~~~~

- Added a User Guide and extended some examples. :issue:`295` by
  :user:`Guillaume Lemaitre `.

Bug fixes
~~~~~~~~~

- Fixed a bug in :func:`utils.check_ratio` such that an error is raised when
  the number of samples required is negative. :issue:`312` by
  :user:`Guillaume Lemaitre `.
- Fixed a bug in :class:`under_sampling.NearMiss` version 3. The indices
  returned were wrong. :issue:`312` by :user:`Guillaume Lemaitre `.
- Fixed a bug for :class:`ensemble.BalanceCascade`, :class:`combine.SMOTEENN`
  and :class:`SMOTETomek`. :issue:`295` by :user:`Guillaume Lemaitre `.
- Fixed a bug for `check_ratio` to be able to pass arguments when `ratio` is
  a callable. :issue:`307` by :user:`Guillaume Lemaitre `.

New features
~~~~~~~~~~~~

- Turn off steps in :class:`pipeline.Pipeline` using the `None` object. By
  :user:`Christos Aridas `.
- Add a fetching function :func:`datasets.fetch_datasets` in order to get
  some imbalanced datasets useful for benchmarking. :issue:`249` by
  :user:`Guillaume Lemaitre `.

Enhancement
~~~~~~~~~~~

- All samplers accept sparse matrices, defaulting to the CSR type.
  :issue:`316` by :user:`Guillaume Lemaitre `.
- :func:`datasets.make_imbalance` takes a ratio similarly to other samplers.
  It supports multiclass. :issue:`312` by :user:`Guillaume Lemaitre `.
- All the unit tests have been factorized and a
  :func:`utils.check_estimators` has been derived from scikit-learn. By
  :user:`Guillaume Lemaitre `.
- Script for automatic build of conda packages and uploading. :issue:`242` by
  :user:`Guillaume Lemaitre `
- Remove the seaborn dependence and improve the examples. :issue:`264` by
  :user:`Guillaume Lemaitre `.
- Adapt all classes to multi-class resampling. :issue:`290` by
  :user:`Guillaume Lemaitre `

API changes summary
~~~~~~~~~~~~~~~~~~~

- `__init__` has been removed from the :class:`base.SamplerMixin` to create a
  real mixin class. :issue:`242` by :user:`Guillaume Lemaitre `.
- Creation of a module :mod:`exceptions` to handle consistent raising of
  errors. :issue:`242` by :user:`Guillaume Lemaitre `.
- Creation of a module ``utils.validation`` to make checking of recurrent
  patterns. :issue:`242` by :user:`Guillaume Lemaitre `.
- Move the under-sampling methods in the ``prototype_selection`` and
  ``prototype_generation`` submodules to make a clearer distinction.
  :issue:`277` by :user:`Guillaume Lemaitre `.
- Change ``ratio`` such that it can adapt to multiple class problems.
  :issue:`290` by :user:`Guillaume Lemaitre `.
Deprecation
~~~~~~~~~~~

- Deprecation of the use of ``min_c_`` in :func:`datasets.make_imbalance`.
  :issue:`312` by :user:`Guillaume Lemaitre `
- Deprecation of the use of float in :func:`datasets.make_imbalance` for the
  ratio parameter. :issue:`290` by :user:`Guillaume Lemaitre `.
- Deprecate the use of float as ratio in favor of dictionary, string, or
  callable. :issue:`290` by :user:`Guillaume Lemaitre `.

imbalanced-learn-0.7.0/doc/whats_new/v0.4.rst

.. _changes_0_4:

Version 0.4.2
=============

Changelog
---------

Bug fixes
.........

- Fix a bug in :class:`imblearn.over_sampling.SMOTENC` in which the median of
  the standard deviation was used instead of half of the median of the
  standard deviation. By :user:`Guillaume Lemaitre ` in :issue:`491`.
- Raise an error when passing a target which is not supported, i.e.
  regression targets or multilabel targets. Imbalanced-learn does not support
  this case. By :user:`Guillaume Lemaitre ` in :issue:`490`.
- Fix a bug in :class:`imblearn.over_sampling.SMOTENC` in which sparse
  matrices were densified during ``inverse_transform``. By :user:`Guillaume
  Lemaitre ` in :issue:`495`.
- Fix a bug in :class:`imblearn.over_sampling.SMOTENC` in which the tie
  breaking was sampling wrongly. By :user:`Guillaume Lemaitre ` in
  :issue:`497`.

Version 0.4
===========

**October, 2018**

.. warning::

   Version 0.4 is the last version of imbalanced-learn to support Python 2.7
   and Python 3.4. Imbalanced-learn 0.5 will require Python 3.5 or higher.

Highlights
----------

This release brings its set of new features as well as some API changes to
strengthen the foundation of imbalanced-learn.

As new features, 2 new modules :mod:`imblearn.keras` and
:mod:`imblearn.tensorflow` have been added in which imbalanced-learn samplers
can be used to generate balanced mini-batches.

The module :mod:`imblearn.ensemble` has been consolidated with new
classifiers: :class:`imblearn.ensemble.BalancedRandomForestClassifier`,
:class:`imblearn.ensemble.EasyEnsembleClassifier`,
:class:`imblearn.ensemble.RUSBoostClassifier`.

Support for strings has been added in
:class:`imblearn.over_sampling.RandomOverSampler` and
:class:`imblearn.under_sampling.RandomUnderSampler`. In addition, a new class
:class:`imblearn.over_sampling.SMOTENC` allows generating samples with data
sets containing both continuous and categorical features.

The :class:`imblearn.over_sampling.SMOTE` has been simplified and broken down
into 2 additional classes: :class:`imblearn.over_sampling.SVMSMOTE` and
:class:`imblearn.over_sampling.BorderlineSMOTE`.

There are also some changes regarding the API: the parameter
``sampling_strategy`` has been introduced to replace the ``ratio`` parameter.
In addition, the ``return_indices`` argument has been deprecated and all
samplers will expose a ``sample_indices_`` attribute whenever this is
possible.

Changelog
---------

API
...

- Replace the parameter ``ratio`` by ``sampling_strategy``. :issue:`411` by
  :user:`Guillaume Lemaitre `.
- Enable to use a ``float`` with binary classification for
  ``sampling_strategy``. :issue:`411` by :user:`Guillaume Lemaitre `.
- Enable to use a ``list`` for the cleaning methods to specify the class to
  sample. :issue:`411` by :user:`Guillaume Lemaitre `.
- Replace ``fit_sample`` by ``fit_resample``. An alias is still available for
  backward compatibility. In addition, ``sample`` has been removed to avoid
  resampling on a different set of data. :issue:`462` by :user:`Guillaume
  Lemaitre `.

New features
............

- Add :mod:`keras` and :mod:`tensorflow` modules to create balanced
  mini-batch generators. :issue:`409` by :user:`Guillaume Lemaitre `.
- Add :class:`imblearn.ensemble.EasyEnsembleClassifier` which creates a bag
  of AdaBoost classifiers trained on balanced bootstrap samples. :issue:`455`
  by :user:`Guillaume Lemaitre `.
- Add :class:`imblearn.ensemble.BalancedRandomForestClassifier` which
  balances each bootstrap provided to each tree of the forest. :issue:`459`
  by :user:`Guillaume Lemaitre `.
- Add :class:`imblearn.ensemble.RUSBoostClassifier` which applies a random
  under-sampling stage before each boosting iteration of AdaBoost.
  :issue:`469` by :user:`Guillaume Lemaitre `.
- Add :class:`imblearn.over_sampling.SMOTENC` which generates synthetic
  samples on data sets with heterogeneous data types (continuous and
  categorical features). :issue:`412` by :user:`Denis Dudnik ` and
  :user:`Guillaume Lemaitre `.

Enhancement
...........

- Add a documentation note to create a balanced random forest from a balanced
  bagging classifier. :issue:`372` by :user:`Guillaume Lemaitre `.
- Document the metrics to evaluate models on imbalanced datasets.
  :issue:`367` by :user:`Guillaume Lemaitre `.
- Add support for one-vs-all encoded targets to support keras. :issue:`409`
  by :user:`Guillaume Lemaitre `.
- Adding specific classes for borderline and SVM SMOTE using
  :class:`BorderlineSMOTE` and :class:`SVMSMOTE`. :issue:`440` by
  :user:`Guillaume Lemaitre `.
- Allow :class:`imblearn.over_sampling.RandomOverSampler` to return indices
  using the attribute ``return_indices``. :issue:`439` by :user:`Hugo Gascon`
  and :user:`Guillaume Lemaitre `.
- Allow :class:`imblearn.under_sampling.RandomUnderSampler` and
  :class:`imblearn.over_sampling.RandomOverSampler` to sample object arrays
  containing strings. :issue:`451` by :user:`Guillaume Lemaitre `.

Bug fixes
.........

- Fix a bug in :func:`metrics.classification_report_imbalanced` for which
  `y_pred` and `y_true` were inverted. :issue:`394` by :user:`Ole Silvig `.
- Fix a bug in ADASYN to consider only samples from the current class when
  generating new samples. :issue:`354` by :user:`Guillaume Lemaitre `.
- Fix a bug which allows for a sorted behavior of the ``sampling_strategy``
  dictionary and thus to obtain deterministic results when using the same
  random state. :issue:`447` by :user:`Guillaume Lemaitre `.
- Force to clone scikit-learn estimators passed as attributes to samplers.
  :issue:`446` by :user:`Guillaume Lemaitre `.
- Fix a bug which was not preserving the dtype of X and y when generating
  samples. :issue:`450` by :user:`Guillaume Lemaitre `.
- Add the option to pass a ``Memory`` object to :func:`make_pipeline` like in
  the :class:`pipeline.Pipeline` class. :issue:`458` by :user:`Christos
  Aridas `.

Maintenance
...........

- Remove parameters deprecated in 0.2. :issue:`331` by :user:`Guillaume
  Lemaitre `.
- Make some modules private. :issue:`452` by :user:`Guillaume Lemaitre `.
- Upgrade requirements to scikit-learn 0.20. :issue:`379` by :user:`Guillaume
  Lemaitre `.
- Catch deprecation warnings in testing. :issue:`441` by :user:`Guillaume
  Lemaitre `.
- Refactor and impose `pytest` style tests. :issue:`470` by :user:`Guillaume
  Lemaitre `.

Documentation
.............

- Remove some docstrings which are not necessary. :issue:`454` by
  :user:`Guillaume Lemaitre `.
- Fix the documentation of the ``sampling_strategy`` parameter when used as a
  float. :issue:`480` by :user:`Guillaume Lemaitre `.

Deprecation
...........

- Deprecate ``ratio`` in favor of ``sampling_strategy``. :issue:`411` by
  :user:`Guillaume Lemaitre `.
- Deprecate the use of a ``dict`` for cleaning methods. A ``list`` should be
  used. :issue:`411` by :user:`Guillaume Lemaitre `.
- Deprecate ``random_state`` in :class:`imblearn.under_sampling.NearMiss`,
  :class:`imblearn.under_sampling.EditedNearestNeighbours`,
  :class:`imblearn.under_sampling.RepeatedEditedNearestNeighbours`,
  :class:`imblearn.under_sampling.AllKNN`,
  :class:`imblearn.under_sampling.NeighbourhoodCleaningRule`,
  :class:`imblearn.under_sampling.InstanceHardnessThreshold`,
  :class:`imblearn.under_sampling.CondensedNearestNeighbour`.
- Deprecate ``kind``, ``out_step``, ``svm_estimator``, ``m_neighbors`` in
  :class:`imblearn.over_sampling.SMOTE`. Users should use
  :class:`imblearn.over_sampling.SVMSMOTE` and
  :class:`imblearn.over_sampling.BorderlineSMOTE`. :issue:`440` by
  :user:`Guillaume Lemaitre `.
- Deprecate :class:`imblearn.ensemble.EasyEnsemble` in favor of the
  meta-estimator :class:`imblearn.ensemble.EasyEnsembleClassifier` which
  follows the exact algorithm described in the literature. :issue:`455` by
  :user:`Guillaume Lemaitre `.
- Deprecate :class:`imblearn.ensemble.BalanceCascade`. :issue:`472` by
  :user:`Guillaume Lemaitre `.
- Deprecate ``return_indices`` in all samplers. Instead, an attribute
  ``sample_indices_`` is created whenever the sampler is selecting a subset
  of the original samples. :issue:`474` by :user:`Guillaume Lemaitre `.
- Add :class:`imblearn.over_sampling.BorderlineSMOTE` and
  :class:`imblearn.over_sampling.SVMSMOTE` in the API documentation.
  :issue:`530` by :user:`Guillaume Lemaitre `.

Enhancement
...........

- Add parallelisation for SMOTEENN and SMOTETomek. :pr:`547` by
  :user:`Michael Hsieh `.
- Add :class:`imblearn.utils._show_versions`. Updated the contribution guide
  and issue template showing how to print system and dependency information
  from the command line. :pr:`557` by :user:`Alexander L. Hayes `.
- Add :class:`imblearn.over_sampling.KMeansSMOTE` which is an over-sampler
  clustering points before applying SMOTE. :pr:`435` by :user:`Stephan
  Heijl `.

Maintenance
...........

- Make it possible to ``import imblearn`` and access submodules. :pr:`500` by
  :user:`Guillaume Lemaitre `.
- Remove support for Python 2 and remove deprecation warnings from
  scikit-learn 0.21. :pr:`576` by :user:`Guillaume Lemaitre `.

Bug
...

- Fix wrong usage of :class:`keras.layers.BatchNormalization` in the
  ``porto_seguro_keras_under_sampling.py`` example. The batch normalization
  was moved before the activation function and the bias was removed from the
  dense layer. :pr:`531` by :user:`Guillaume Lemaitre `.
- Fix a bug which converted sparse matrices to COO format when stacking them
  in :class:`imblearn.over_sampling.SMOTENC`. This bug affected only old
  scipy versions. :pr:`539` by :user:`Guillaume Lemaitre `.
- Fix a bug in :class:`imblearn.pipeline.Pipeline` where None could be the
  final estimator. :pr:`554` by :user:`Oliver Rausch `.
- Fix a bug in :class:`imblearn.over_sampling.SVMSMOTE` and
  :class:`imblearn.over_sampling.BorderlineSMOTE` where the default parameter
  of ``n_neighbors`` was not set properly. :pr:`578` by :user:`Guillaume
  Lemaitre `.
- Fix a bug by changing the default depth in
  :class:`imblearn.ensemble.RUSBoostClassifier` to get a decision stump as a
  weak learner as in the original paper. :pr:`545` by :user:`Christos
  Aridas `.
- Allow importing ``keras`` directly from ``tensorflow`` in
  :mod:`imblearn.keras`. :pr:`531` by :user:`Guillaume Lemaitre `.
imbalanced-learn-0.7.0/doc/whats_new/v0.6.rst

.. _changes_0_6_2:

Version 0.6.2
==============

This is a bug-fix release to resolve some issues regarding the handling of
the input and output formats of the arrays.

Changelog
---------

- Allow column vectors to be passed as targets. :pr:`673` by :user:`Christos
  Aridas `.
- Better input/output handling for pandas, numpy and plain lists. :pr:`681`
  by :user:`Christos Aridas `.

.. _changes_0_6_1:

Version 0.6.1
==============

This is a bug-fix release to primarily resolve some packaging issues in
version 0.6.0. It also includes minor documentation improvements and some bug
fixes.

Changelog
---------

Bug fixes
.........

- Fix a bug in :class:`imblearn.ensemble.BalancedRandomForestClassifier`
  leading to a wrong number of samples used during fitting due to
  `max_samples` and therefore a bad computation of the OOB score. :pr:`656`
  by :user:`Guillaume Lemaitre `.

.. _changes_0_6:

Version 0.6.0
=============

Changelog
---------

Changed models
..............

The following models might give some different sampling due to changes in
scikit-learn:

- :class:`imblearn.under_sampling.ClusterCentroids`
- :class:`imblearn.under_sampling.InstanceHardnessThreshold`

The following samplers will give different results due to changes linked to
the internal usage of the random state:

- :class:`imblearn.over_sampling.ADASYN`
- :class:`imblearn.over_sampling.SMOTENC`

Bug fixes
.........

- :class:`imblearn.under_sampling.InstanceHardnessThreshold` now takes into
  account the `random_state` and will give deterministic results. In
  addition, `cross_val_predict` is used to take advantage of the parallelism.
  :pr:`599` by :user:`Shihab Shahriar Khan `.
- Fix a bug in :class:`imblearn.ensemble.BalancedRandomForestClassifier`
  leading to a wrong computation of the OOB score. :pr:`656` by
  :user:`Guillaume Lemaitre `.

Maintenance
...........

- Update imports from scikit-learn after some modules have been made private.
  The following imports have been changed:
  :class:`sklearn.ensemble._base._set_random_states`,
  :class:`sklearn.ensemble._forest._parallel_build_trees`,
  :class:`sklearn.metrics._classification._check_targets`,
  :class:`sklearn.metrics._classification._prf_divide`,
  :class:`sklearn.utils.Bunch`, :class:`sklearn.utils._safe_indexing`,
  :class:`sklearn.utils._testing.assert_allclose`,
  :class:`sklearn.utils._testing.assert_array_equal`,
  :class:`sklearn.utils._testing.SkipTest`. :pr:`617` by :user:`Guillaume
  Lemaitre `.
- Synchronize :mod:`imblearn.pipeline` with :mod:`sklearn.pipeline`.
  :pr:`620` by :user:`Guillaume Lemaitre `.
- Synchronize :class:`imblearn.ensemble.BalancedRandomForestClassifier` and
  add the parameters `max_samples` and `ccp_alpha`. :pr:`621` by
  :user:`Guillaume Lemaitre `.

Enhancement
...........

- :class:`imblearn.under_sampling.RandomUnderSampler`,
  :class:`imblearn.over_sampling.RandomOverSampler` and
  :class:`imblearn.datasets.make_imbalance` accept Pandas DataFrames as input
  and will output Pandas DataFrames. Similarly, they will accept Pandas
  Series as input and output Pandas Series. :pr:`636` by :user:`Guillaume
  Lemaitre `.
- :class:`imblearn.FunctionSampler` accepts a parameter ``validate`` allowing
  to check or not the input ``X`` and ``y``. :pr:`637` by :user:`Guillaume
  Lemaitre `.
- :class:`imblearn.under_sampling.RandomUnderSampler` and
  :class:`imblearn.over_sampling.RandomOverSampler` can resample when non
  finite values are present in ``X``. :pr:`643` by :user:`Guillaume
  Lemaitre `.
- All samplers will output a Pandas DataFrame if a Pandas DataFrame was given
  as an input. :pr:`644` by :user:`Guillaume Lemaitre `.
- The samples generation in :class:`imblearn.over_sampling.ADASYN`,
  :class:`imblearn.over_sampling.SMOTE`,
  :class:`imblearn.over_sampling.BorderlineSMOTE`,
  :class:`imblearn.over_sampling.SVMSMOTE`,
  :class:`imblearn.over_sampling.KMeansSMOTE` and
  :class:`imblearn.over_sampling.SMOTENC` is now vectorized, giving an
  additional speed-up when `X` is sparse. :pr:`596` and :pr:`649` by
  :user:`Matt Eding `.

Deprecation
...........

- The following classes have been removed after 2 deprecation cycles:
  `ensemble.BalanceCascade` and `ensemble.EasyEnsemble`. :pr:`617` by
  :user:`Guillaume Lemaitre `.
- The following functions have been removed after 2 deprecation cycles:
  `utils.check_ratio`. :pr:`617` by :user:`Guillaume Lemaitre `.
- The parameters `ratio` and `return_indices` have been removed from all
  samplers. :pr:`617` by :user:`Guillaume Lemaitre `.
- The parameters `m_neighbors`, `out_step`, `kind`, `svm_estimator` have been
  removed from :class:`imblearn.over_sampling.SMOTE`. :pr:`617` by
  :user:`Guillaume Lemaitre `.

imbalanced-learn-0.7.0/doc/whats_new/v0.7.rst

.. _changes_0_7:

Version 0.7.0
=============

Changelog
---------

Maintenance
...........

- Ensure that :class:`imblearn.pipeline.Pipeline` is working when `memory` is
  activated and `joblib==0.11`. :pr:`687` by :user:`Christos Aridas `.
- Refactor common tests to use the dev tools from `scikit-learn` 0.23.
  :pr:`710` by :user:`Guillaume Lemaitre `.
- Remove `FutureWarning` issued by `scikit-learn` 0.23. :pr:`710` by
  :user:`Guillaume Lemaitre `.
- Impose keyword-only arguments as in `scikit-learn`. :pr:`721` by
  :user:`Guillaume Lemaitre `.

Changed models
..............

The following models might give some different results due to changes:

- :class:`imblearn.ensemble.BalancedRandomForestClassifier`

Bug fixes
.........

- Change the default value of `min_samples_leaf` to be consistent with
  scikit-learn. :pr:`711` by :user:`zerolfx `.
- Fix a bug due to a change in `scikit-learn` 0.23 in
  :class:`imblearn.metrics.make_index_balanced_accuracy`. The function was
  unusable. :pr:`710` by :user:`Guillaume Lemaitre `.
- Raise a proper error message when only numerical or categorical features
  are given in :class:`imblearn.over_sampling.SMOTENC`. :pr:`720` by
  :user:`Guillaume Lemaitre `.
- Fix a bug when the median of the standard deviation is null in
  :class:`imblearn.over_sampling.SMOTENC`. :pr:`675` by :user:`bganglia `.

Enhancements
............

- The classifiers implemented in imbalanced-learn,
  :class:`imblearn.ensemble.BalancedBaggingClassifier`,
  :class:`imblearn.ensemble.BalancedRandomForestClassifier`,
  :class:`imblearn.ensemble.EasyEnsembleClassifier`, and
  :class:`imblearn.ensemble.RUSBoostClassifier`, accept `sampling_strategy`
  with the same keys as in `y` without the need of encoding `y` in advance.
  :pr:`718` by :user:`Guillaume Lemaitre `.
- Lazily import the `keras` module when importing `imblearn.keras`.
  :pr:`719` by :user:`Guillaume Lemaitre `.

Deprecation
...........

- Deprecation of the parameter `n_jobs` in
  :class:`imblearn.under_sampling.ClusterCentroids` since it was used by
  :class:`sklearn.cluster.KMeans` which deprecated it. :pr:`710` by
  :user:`Guillaume Lemaitre `.
- Deprecation of passing keyword arguments by position, similarly to
  `scikit-learn`. :pr:`721` by :user:`Guillaume Lemaitre `.
imbalanced-learn-0.7.0/examples/README.txt

.. _general_examples:

General examples
----------------

General-purpose and introductory examples for the `imbalanced-learn` toolbox.

imbalanced-learn-0.7.0/examples/applications/README.txt

.. _realword_examples:

Examples based on real world datasets
-------------------------------------

Examples which use real-world datasets.

imbalanced-learn-0.7.0/examples/applications/plot_impact_imbalanced_classes.py

"""
==========================================================
Fitting model on imbalanced datasets and how to fight bias
==========================================================

This example illustrates the problem induced by learning on datasets having
imbalanced classes. Subsequently, we compare different approaches alleviating
these negative effects.
"""

# Authors: Guillaume Lemaitre
# License: MIT

print(__doc__)

###############################################################################
# Problem definition
###############################################################################

from sklearn.datasets import fetch_openml

df, y = fetch_openml('adult', version=2, as_frame=True, return_X_y=True)
# we are dropping the following features:
# - "fnlwgt": this feature was created while studying the "adult" dataset.
#   Thus, we will not use this feature, which is not acquired during the
#   survey.
# - "education-num": it encodes the same information as "education". Thus, we
#   are removing one of these 2 features.
df = df.drop(columns=['fnlwgt', 'education-num'])

###############################################################################
# The "adult" dataset has a class ratio of about 3:1

classes_count = y.value_counts()
classes_count

###############################################################################
# This dataset is only slightly imbalanced. To better highlight the effect of
# learning from an imbalanced dataset, we will increase its ratio to 30:1

from imblearn.datasets import make_imbalance

ratio = 30
df_res, y_res = make_imbalance(
    df, y, sampling_strategy={
        classes_count.idxmin(): classes_count.max() // ratio
    }
)
y_res.value_counts()

###############################################################################
# For the rest of the notebook, we will make a single split to get training
# and testing data. Note that you should use cross-validation to have an
# estimate of the performance variation in practice.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_res, y_res, stratify=y_res, random_state=42
)

###############################################################################
# As a baseline, we could use a classifier which will always predict the
# majority class independently of the features provided.
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
score = dummy_clf.fit(X_train, y_train).score(X_test, y_test)
print(f"Accuracy score of a dummy classifier: {score:.3f}")

##############################################################################
# Instead of using the accuracy, we can use the balanced accuracy which will
# take into account the balancing issue.

from sklearn.metrics import balanced_accuracy_score

y_pred = dummy_clf.predict(X_test)
score = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced accuracy score of a dummy classifier: {score:.3f}")

###############################################################################
# Strategies to learn from an imbalanced dataset
###############################################################################

###############################################################################
# We will first define a helper function which will train a given model and
# compute both accuracy and balanced accuracy. The results will be stored in
# a dataframe

import pandas as pd


def evaluate_classifier(clf, df_scores, clf_name=None):
    from sklearn.pipeline import Pipeline
    if clf_name is None:
        if isinstance(clf, Pipeline):
            clf_name = clf[-1].__class__.__name__
        else:
            clf_name = clf.__class__.__name__
    acc = clf.fit(X_train, y_train).score(X_test, y_test)
    y_pred = clf.predict(X_test)
    bal_acc = balanced_accuracy_score(y_test, y_pred)
    clf_score = pd.DataFrame(
        {clf_name: [acc, bal_acc]},
        index=['Accuracy', 'Balanced accuracy']
    )
    df_scores = pd.concat([df_scores, clf_score], axis=1).round(decimals=3)
    return df_scores


# Let's define an empty dataframe to store the results
df_scores = pd.DataFrame()

###############################################################################
# Dummy baseline
# ..............
#
# Before training a real machine learning model, we can store the results
# obtained with our `DummyClassifier`.

df_scores = evaluate_classifier(dummy_clf, df_scores, "Dummy")
df_scores

###############################################################################
# Linear classifier baseline
# ..........................
#
# We will create a machine learning pipeline using a `LogisticRegression`
# classifier. In this regard, we will need to one-hot encode the categorical
# columns and standardize the numerical columns before injecting the data
# into the `LogisticRegression` classifier.
#
# First, we define our numerical and categorical pipelines.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

num_pipe = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
)
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore")
)

###############################################################################
# Then, we can create a preprocessor which will dispatch the categorical
# columns to the categorical pipeline and the numerical columns to the
# numerical pipeline.

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

preprocessor_linear = ColumnTransformer(
    [("num-pipe", num_pipe, selector(dtype_include=np.number)),
     ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))],
    n_jobs=2
)

###############################################################################
# Finally, we connect our preprocessor with our `LogisticRegression`. We can
# then evaluate our model.

from sklearn.linear_model import LogisticRegression

lr_clf = make_pipeline(
    preprocessor_linear, LogisticRegression(max_iter=1000)
)

df_scores = evaluate_classifier(lr_clf, df_scores, "LR")
df_scores

###############################################################################
# We can see that our linear model is learning slightly better than our dummy
# baseline. However, it is impacted by the class imbalance.
#
# We can verify that something similar is happening with a tree-based model
# such as `RandomForestClassifier`. With this type of classifier, we will not
# need to scale the numerical data, and we will only need to ordinal encode
# the categorical data.

from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OrdinalEncoder()
)

preprocessor_tree = ColumnTransformer(
    [("num-pipe", num_pipe, selector(dtype_include=np.number)),
     ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))],
    n_jobs=2
)

rf_clf = make_pipeline(
    preprocessor_tree, RandomForestClassifier(random_state=42, n_jobs=2)
)

df_scores = evaluate_classifier(rf_clf, df_scores, "RF")
df_scores

###############################################################################
# The `RandomForestClassifier` is affected by the class imbalance as well,
# although slightly less than the linear model. Now, we will present
# different approaches to improve the performance of these 2 models.
#
# Use `class_weight`
# ..................
#
# Most of the models in `scikit-learn` have a parameter `class_weight`. This
# parameter will affect the computation of the loss in linear models or the
# criterion in tree-based models to penalize differently misclassifications
# of the minority and majority classes. We can set
# `class_weight="balanced"` such that the weight applied is inversely
# proportional to the class frequency. We test this parametrization in both
# the linear and the tree-based models.
lr_clf.set_params(logisticregression__class_weight="balanced")

df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with class weight"
)
df_scores

###############################################################################
#

rf_clf.set_params(randomforestclassifier__class_weight="balanced")

df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with class weight"
)
df_scores

###############################################################################
# We can see that using `class_weight` was really effective for the linear
# model, alleviating the issue of learning from imbalanced classes. However,
# the `RandomForestClassifier` is still biased toward the majority class,
# mainly due to the criterion which is not suited enough to fight the class
# imbalance.
#
# Resample the training set during learning
# .........................................
#
# Another way is to resample the training set by under-sampling or
# over-sampling some of the samples. `imbalanced-learn` provides some
# samplers to do such processing.

from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from imblearn.under_sampling import RandomUnderSampler

lr_clf = make_pipeline_with_sampler(
    preprocessor_linear,
    RandomUnderSampler(random_state=42),
    LogisticRegression(max_iter=1000)
)

df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with under-sampling"
)
df_scores

###############################################################################
#

rf_clf = make_pipeline_with_sampler(
    preprocessor_tree,
    RandomUnderSampler(random_state=42),
    RandomForestClassifier(random_state=42, n_jobs=2)
)

df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with under-sampling"
)
df_scores

###############################################################################
# Applying a random under-sampler before training the linear model or the
# random forest allows the model not to focus on the majority class, at the
# cost of making more mistakes for samples in the majority class (i.e.,
# decreased accuracy).
#
# We could apply any type of sampler and find which sampler is working best
# on the current dataset.
#
# Instead, we will present another way by using classifiers which will apply
# sampling internally.
#
# Use of `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`
# .......................................................................
#
# We already showed that random under-sampling can be effective on decision
# trees. However, instead of under-sampling the dataset once, one could
# under-sample the original dataset before taking each bootstrap sample. This
# is the basis of the `BalancedRandomForestClassifier` and
# `BalancedBaggingClassifier`.

from imblearn.ensemble import BalancedRandomForestClassifier

rf_clf = make_pipeline(
    preprocessor_tree,
    BalancedRandomForestClassifier(random_state=42, n_jobs=2)
)

df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF")
df_scores

###############################################################################
# The performance with the `BalancedRandomForestClassifier` is better than
# applying a single random under-sampling. We will use a gradient-boosting
# classifier within a `BalancedBaggingClassifier`.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.ensemble import BalancedBaggingClassifier

bag_clf = make_pipeline(
    preprocessor_tree,
    BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10, random_state=42, n_jobs=2
    )
)

df_scores = evaluate_classifier(
    bag_clf, df_scores, "Balanced bagging"
)
df_scores

###############################################################################
# This last approach is the most effective. The different under-samplings
# bring some diversity for the different GBDTs to learn from, without
# focusing on a portion of the majority class only.
#
# We will repeat the same experiment but with a ratio of 100:1 and make a
# similar analysis.

###############################################################################
# Increase the imbalance ratio
###############################################################################

ratio = 100
df_res, y_res = make_imbalance(
    df, y, sampling_strategy={
        classes_count.idxmin(): classes_count.max() // ratio
    }
)
X_train, X_test, y_train, y_test = train_test_split(
    df_res, y_res, stratify=y_res, random_state=42
)

df_scores = pd.DataFrame()
df_scores = evaluate_classifier(dummy_clf, df_scores, "Dummy")
lr_clf = make_pipeline(
    preprocessor_linear, LogisticRegression(max_iter=1000)
)
df_scores = evaluate_classifier(lr_clf, df_scores, "LR")
rf_clf = make_pipeline(
    preprocessor_tree, RandomForestClassifier(random_state=42, n_jobs=2)
)
df_scores = evaluate_classifier(rf_clf, df_scores, "RF")
lr_clf.set_params(logisticregression__class_weight="balanced")
df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with class weight"
)
rf_clf.set_params(randomforestclassifier__class_weight="balanced")
df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with class weight"
)
lr_clf = make_pipeline_with_sampler(
    preprocessor_linear,
    RandomUnderSampler(random_state=42),
    LogisticRegression(max_iter=1000)
)
df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with under-sampling"
)
rf_clf = make_pipeline_with_sampler(
    preprocessor_tree,
    RandomUnderSampler(random_state=42),
    RandomForestClassifier(random_state=42, n_jobs=2)
)
df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with under-sampling"
)
rf_clf = make_pipeline(
    preprocessor_tree,
    BalancedRandomForestClassifier(random_state=42, n_jobs=2)
)
df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF")
df_scores = evaluate_classifier(
    bag_clf, df_scores, "Balanced bagging"
)
df_scores

###############################################################################
# When we analyse the results, we can draw conclusions similar to those of
# the previous discussion. However, we can observe that the strategy
# `class_weight="balanced"` does not improve the performance when using a
# `RandomForestClassifier`. A resampling is indeed required. The most
# effective method remains the `BalancedBaggingClassifier` using a GBDT as a
# base learner.

imbalanced-learn-0.7.0/examples/applications/plot_multi_class_under_sampling.py

"""
=============================================
Multiclass classification with under-sampling
=============================================

Some balancing methods allow for balancing datasets with multiple classes.
We provide an example to illustrate the use of those methods, which do not
differ from the binary case.
""" # Authors: Guillaume Lemaitre # License: MIT from collections import Counter from sklearn.datasets import load_iris from sklearn.svm import LinearSVC from sklearn.model_selection import train_test_split from imblearn.datasets import make_imbalance from imblearn.under_sampling import NearMiss from imblearn.pipeline import make_pipeline from imblearn.metrics import classification_report_imbalanced print(__doc__) RANDOM_STATE = 42 # Create a folder to fetch the dataset iris = load_iris() X, y = make_imbalance(iris.data, iris.target, sampling_strategy={0: 25, 1: 50, 2: 50}, random_state=RANDOM_STATE) X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=RANDOM_STATE) print('Training target statistics: {}'.format(Counter(y_train))) print('Testing target statistics: {}'.format(Counter(y_test))) # Create a pipeline pipeline = make_pipeline(NearMiss(version=2), LinearSVC(random_state=RANDOM_STATE)) pipeline.fit(X_train, y_train) # Classify and report the results print(classification_report_imbalanced(y_test, pipeline.predict(X_test))) imbalanced-learn-0.7.0/examples/applications/plot_over_sampling_benchmark_lfw.py000066400000000000000000000061401366766276300303660ustar00rootroot00000000000000""" ========================================================== Benchmark over-sampling methods in a face recognition task ========================================================== In this face recognition example two faces are used from the LFW (Faces in the Wild) dataset. Several implemented over-sampling methods are used in conjunction with a 3NN classifier in order to examine the improvement of the classifier's output quality by using an over-sampler. """ # Authors: Christos Aridas # Guillaume Lemaitre # License: MIT import matplotlib.pyplot as plt import numpy as np from scipy import interp from sklearn import datasets, neighbors from sklearn.metrics import auc, roc_curve from sklearn.model_selection import StratifiedKFold from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler from imblearn.pipeline import make_pipeline print(__doc__) LW = 2 RANDOM_STATE = 42 class DummySampler: def sample(self, X, y): return X, y def fit(self, X, y): return self def fit_resample(self, X, y): return self.sample(X, y) cv = StratifiedKFold(n_splits=3) # Load the dataset data = datasets.fetch_lfw_people() majority_person = 1871 # 530 photos of George W Bush minority_person = 531 # 29 photos of Bill Clinton majority_idxs = np.flatnonzero(data.target == majority_person) minority_idxs = np.flatnonzero(data.target == minority_person) idxs = np.hstack((majority_idxs, minority_idxs)) X = data.data[idxs] y = data.target[idxs] y[y == majority_person] = 0 y[y == minority_person] = 1 classifier = ['3NN', neighbors.KNeighborsClassifier(3)] samplers = [ ['Standard', DummySampler()], ['ADASYN', ADASYN(random_state=RANDOM_STATE)], ['ROS', RandomOverSampler(random_state=RANDOM_STATE)], ['SMOTE', SMOTE(random_state=RANDOM_STATE)], ] pipelines = [ ['{}-{}'.format(sampler[0], classifier[0]), make_pipeline(sampler[1], classifier[1])] for sampler in samplers ] fig = plt.figure() ax = fig.add_subplot(1, 1, 1) for name, pipeline in pipelines: mean_tpr = 0.0 mean_fpr = np.linspace(0, 1, 100) for train, test in cv.split(X, y): probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test]) fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) mean_tpr += interp(mean_fpr, fpr, tpr) mean_tpr[0] = 0.0 roc_auc = auc(fpr, tpr) mean_tpr /= cv.get_n_splits(X, y) mean_tpr[-1] = 1.0 mean_auc = auc(mean_fpr, mean_tpr) 
    plt.plot(mean_fpr, mean_tpr, linestyle='--',
             label='{} (area = {:.2f})'.format(name, mean_auc), lw=LW)

plt.plot([0, 1], [0, 1], linestyle='--', lw=LW, color='k', label='Luck')

# make nice plotting
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['left'].set_position(('outward', 10))
ax.spines['bottom'].set_position(('outward', 10))
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')

plt.legend(loc="lower right")

plt.show()

imbalanced-learn-0.7.0/examples/applications/plot_topic_classication.py

"""
=================================================
Example of topic classification in text documents
=================================================

This example shows how to balance the text data before training a classifier.

Note that for this example, the data are slightly imbalanced but it can
happen that for some datasets, the imbalance ratio is more significant.
"""

# Authors: Guillaume Lemaitre
# License: MIT

from collections import Counter

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

###############################################################################
# Setting the data set
###############################################################################

###############################################################################
# We use a part of the 20 newsgroups data set by loading 4 topics. Using the
# scikit-learn loader, the data are split into a training and a testing set.
#
# Note that the class \#3 is the minority class and has about half as many
# samples as the majority class.

categories = ['alt.atheism', 'talk.religion.misc',
              'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

X_train = newsgroups_train.data
X_test = newsgroups_test.data

y_train = newsgroups_train.target
y_test = newsgroups_test.target

print('Training class distributions summary: {}'.format(Counter(y_train)))
print('Test class distributions summary: {}'.format(Counter(y_test)))

###############################################################################
# The usual scikit-learn pipeline
###############################################################################

###############################################################################
# You would usually use a scikit-learn pipeline combining the TF-IDF
# vectorizer with a multinomial naive Bayes classifier. A classification
# report summarizes the results on the testing set.
#
# As expected, the recall of the class \#3 is low, mainly due to the class
# imbalance.
pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print(classification_report_imbalanced(y_test, y_pred))

###############################################################################
# Balancing the classes before classification
###############################################################################

###############################################################################
# To improve the prediction of the class \#3, it could be interesting to
# apply balancing before training the naive Bayes classifier. Therefore, we
# will use a ``RandomUnderSampler`` to equalize the number of samples in all
# the classes before the training.
#
# It is also important to note that we are using the ``make_pipeline``
# function implemented in imbalanced-learn to properly handle the samplers.

pipe = make_pipeline_imb(TfidfVectorizer(),
                         RandomUnderSampler(),
                         MultinomialNB())

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

###############################################################################
# Although the results are almost identical, it can be seen that the
# resampling allowed correcting the poor recall of the class \#3 at the cost
# of reducing the other metrics for the other classes. However, the overall
# results are slightly better.

print(classification_report_imbalanced(y_test, y_pred))

imbalanced-learn-0.7.0/examples/applications/porto_seguro_keras_under_sampling.py

"""
==========================================================
Porto Seguro: balancing samples in mini-batches with Keras
==========================================================

This example compares two strategies to train a neural-network on the Porto
Seguro Kaggle data set [1]_. The data set is imbalanced and we show that
balancing each mini-batch improves performance and reduces the training time.

References
----------

.. [1] https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data

"""

# Authors: Guillaume Lemaitre
# License: MIT

print(__doc__)

###############################################################################
# Data loading
###############################################################################

from collections import Counter

import pandas as pd
import numpy as np

###############################################################################
# First, you should download the Porto Seguro data set from Kaggle. See the
# link in the introduction.

training_data = pd.read_csv('./input/train.csv')
testing_data = pd.read_csv('./input/test.csv')

y_train = training_data[['id', 'target']].set_index('id')
X_train = training_data.drop(['target'], axis=1).set_index('id')
X_test = testing_data.set_index('id')

###############################################################################
# The data set is imbalanced and it will have an effect on the fitting.
print('The data set is imbalanced: {}'.format(Counter(y_train['target']))) ############################################################################### # Define the pre-processing pipeline ############################################################################### from sklearn.compose import ColumnTransformer from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import FunctionTransformer from sklearn.impute import SimpleImputer def convert_float64(X): return X.astype(np.float64) ############################################################################### # We want to standard scale the numerical features while we want to one-hot # encode the categorical features. In this regard, we make use of the # :class:`sklearn.compose.ColumnTransformer`. numerical_columns = [name for name in X_train.columns if '_calc_' in name and '_bin' not in name] numerical_pipeline = make_pipeline( FunctionTransformer(func=convert_float64, validate=False), StandardScaler()) categorical_columns = [name for name in X_train.columns if '_cat' in name] categorical_pipeline = make_pipeline( SimpleImputer(missing_values=-1, strategy='most_frequent'), OneHotEncoder(categories='auto')) preprocessor = ColumnTransformer( [('numerical_preprocessing', numerical_pipeline, numerical_columns), ('categorical_preprocessing', categorical_pipeline, categorical_columns)], remainder='drop') # Create an environment variable to avoid using the GPU. This can be changed. import os os.environ['CUDA_VISIBLE_DEVICES'] = '-1' ############################################################################### # Create a neural-network ############################################################################### from keras.models import Sequential from keras.layers import Activation, Dense, Dropout, BatchNormalization def make_model(n_features): model = Sequential() model.add(Dense(200, input_shape=(n_features,), kernel_initializer='glorot_normal')) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(Dense(100, kernel_initializer='glorot_normal', use_bias=False)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.25)) model.add(Dense(50, kernel_initializer='glorot_normal', use_bias=False)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.15)) model.add(Dense(25, kernel_initializer='glorot_normal', use_bias=False)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.1)) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) return model ############################################################################### # We create a decorator to report the computation time import time from functools import wraps def timeit(f): @wraps(f) def wrapper(*args, **kwds): start_time = time.time() result = f(*args, **kwds) elapsed_time = time.time() - start_time print('Elapsed computation time: {:.3f} secs' .format(elapsed_time)) return (elapsed_time, result) return wrapper ############################################################################### # The first model will be trained using the ``fit`` method and with imbalanced # mini-batches. 
from sklearn.metrics import roc_auc_score @timeit def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test): model = make_model(X_train.shape[1]) model.fit(X_train, y_train, epochs=2, verbose=1, batch_size=1000) y_pred = model.predict_proba(X_test, batch_size=1000) return roc_auc_score(y_test, y_pred) ############################################################################### # On the contrary, we will use imbalanced-learn to create a generator of # mini-batches which will yield balanced mini-batches. from imblearn.keras import BalancedBatchGenerator @timeit def fit_predict_balanced_model(X_train, y_train, X_test, y_test): model = make_model(X_train.shape[1]) training_generator = BalancedBatchGenerator(X_train, y_train, batch_size=1000, random_state=42) model.fit_generator(generator=training_generator, epochs=5, verbose=1) y_pred = model.predict_proba(X_test, batch_size=1000) return roc_auc_score(y_test, y_pred) ############################################################################### # Classification loop ############################################################################### ############################################################################### # We will perform a 10-fold cross-validation and train the neural-network with # the two different strategies previously presented. from sklearn.model_selection import StratifiedKFold skf = StratifiedKFold(n_splits=10) cv_results_imbalanced = [] cv_time_imbalanced = [] cv_results_balanced = [] cv_time_balanced = [] for train_idx, valid_idx in skf.split(X_train, y_train): X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx]) y_local_train = y_train.iloc[train_idx].values.ravel() X_local_test = preprocessor.transform(X_train.iloc[valid_idx]) y_local_test = y_train.iloc[valid_idx].values.ravel() elapsed_time, roc_auc = fit_predict_imbalanced_model( X_local_train, y_local_train, X_local_test, y_local_test) cv_time_imbalanced.append(elapsed_time) cv_results_imbalanced.append(roc_auc) elapsed_time, roc_auc = fit_predict_balanced_model( X_local_train, y_local_train, X_local_test, y_local_test) cv_time_balanced.append(elapsed_time) cv_results_balanced.append(roc_auc) ############################################################################### # Plot of the results and computation time ############################################################################### df_results = (pd.DataFrame({'Balanced model': cv_results_balanced, 'Imbalanced model': cv_results_imbalanced}) .unstack().reset_index()) df_time = (pd.DataFrame({'Balanced model': cv_time_balanced, 'Imbalanced model': cv_time_imbalanced}) .unstack().reset_index()) import seaborn as sns import matplotlib.pyplot as plt plt.figure() sns.boxplot(y='level_0', x=0, data=df_time) sns.despine(top=True, right=True, left=True) plt.xlabel('time [s]') plt.ylabel('') plt.title('Computation time difference using a random under-sampling') plt.figure() sns.boxplot(y='level_0', x=0, data=df_results, whis=10.0) sns.despine(top=True, right=True, left=True) ax = plt.gca() ax.xaxis.set_major_formatter( plt.FuncFormatter(lambda x, pos: "%i%%" % (100 * x))) plt.xlabel('ROC-AUC') plt.ylabel('') plt.title('Difference in terms of ROC-AUC using a random under-sampling') imbalanced-learn-0.7.0/examples/combine/000077500000000000000000000000001366766276300201545ustar00rootroot00000000000000imbalanced-learn-0.7.0/examples/combine/README.txt000066400000000000000000000004261366766276300216540ustar00rootroot00000000000000..
_combine_examples: Examples using combine class methods ==================================== Combine methods mix over- and under-sampling methods. Generally SMOTE is used for over-sampling while some cleaning methods (i.e., ENN and Tomek links) are used to under-sample. imbalanced-learn-0.7.0/examples/combine/plot_comparison_combine.py000066400000000000000000000104501366766276300254320ustar00rootroot00000000000000""" ==================================================================== Comparison of the combination of over- and under-sampling algorithms ==================================================================== This example shows the effect of applying an under-sampling algorithm after SMOTE over-sampling. In the literature, Tomek's link and edited nearest neighbours are the two methods which have been used and are available in imbalanced-learn. """ # Authors: Guillaume Lemaitre # License: MIT from collections import Counter import matplotlib.pyplot as plt import numpy as np from sklearn.datasets import make_classification from sklearn.svm import LinearSVC from imblearn.pipeline import make_pipeline from imblearn.over_sampling import SMOTE from imblearn.combine import SMOTEENN, SMOTETomek print(__doc__) ############################################################################### # The following function will be used to create a toy dataset. It uses # ``make_classification`` from scikit-learn but fixes some parameters. def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3, class_sep=0.8, n_clusters=1): return make_classification(n_samples=n_samples, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters, weights=list(weights), class_sep=class_sep, random_state=0) ############################################################################### # The following function will be used to plot the sample space after resampling # to illustrate the characteristics of an algorithm. def plot_resampling(X, y, sampling, ax): X_res, y_res = sampling.fit_resample(X, y) ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k') # make nice plotting ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left() ax.spines['left'].set_position(('outward', 10)) ax.spines['bottom'].set_position(('outward', 10)) return Counter(y_res) ############################################################################### # The following function will be used to plot the decision function of a # classifier given some data. def plot_decision_function(X, y, clf, ax): plot_step = 0.02 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, alpha=0.4) ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor='k') ############################################################################### # ``SMOTE`` allows generating new samples. However, this method of over-sampling # does not have any knowledge regarding the underlying distribution. Therefore, # some noisy samples can be generated, e.g. when the different classes cannot # be well separated. Hence, it can be beneficial to apply an under-sampling # algorithm to clean the noisy samples.
Two methods are usually used in the # literature: (i) Tomek's link and (ii) edited nearest neighbours cleaning # methods. Imbalanced-learn provides two ready-to-use samplers ``SMOTETomek`` # and ``SMOTEENN``. In general, ``SMOTEENN`` cleans more noisy data than # ``SMOTETomek``. fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(15, 25)) X, y = create_dataset(n_samples=1000, weights=(0.1, 0.2, 0.7)) ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6)) for ax, sampler in zip(ax_arr, ( SMOTE(random_state=0), SMOTEENN(random_state=0), SMOTETomek(random_state=0))): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax[0]) ax[0].set_title('Decision function for {}'.format( sampler.__class__.__name__)) plot_resampling(X, y, sampler, ax[1]) ax[1].set_title('Resampling using {}'.format( sampler.__class__.__name__)) fig.tight_layout() plt.show() imbalanced-learn-0.7.0/examples/datasets/000077500000000000000000000000001366766276300203505ustar00rootroot00000000000000imbalanced-learn-0.7.0/examples/datasets/README.txt000066400000000000000000000001721366766276300220460ustar00rootroot00000000000000.. _dataset_examples: Dataset examples ----------------------- Examples concerning the :mod:`imblearn.datasets` module. imbalanced-learn-0.7.0/examples/datasets/plot_make_imbalance.py000066400000000000000000000032521366766276300246720ustar00rootroot00000000000000""" ============================ Create an imbalanced dataset ============================ An illustration of the :func:`imblearn.datasets.make_imbalance` function to create an imbalanced dataset from a balanced dataset. We show the ability of :func:`imblearn.datasets.make_imbalance` to deal with a Pandas DataFrame. """ # Authors: Dayvid Oliveira # Christos Aridas # Guillaume Lemaitre # License: MIT from collections import Counter import pandas as pd import matplotlib.pyplot as plt from sklearn.datasets import make_moons from imblearn.datasets import make_imbalance print(__doc__) # Generate the dataset X, y = make_moons(n_samples=200, shuffle=True, noise=0.5, random_state=10) X = pd.DataFrame(X, columns=["feature 1", "feature 2"]) # Two subplots, unpack the axes array immediately f, axs = plt.subplots(2, 3) axs = [a for ax in axs for a in ax] X.plot.scatter( x='feature 1', y='feature 2', c=y, ax=axs[0], colormap='viridis', colorbar=False ) axs[0].set_title('Original set') def ratio_func(y, multiplier, minority_class): target_stats = Counter(y) return {minority_class: int(multiplier * target_stats[minority_class])} multipliers = [0.9, 0.75, 0.5, 0.25, 0.1] for i, multiplier in enumerate(multipliers, start=1): ax = axs[i] X_, y_ = make_imbalance(X, y, sampling_strategy=ratio_func, **{"multiplier": multiplier, "minority_class": 1}) X_.plot.scatter( x='feature 1', y='feature 2', c=y_, ax=ax, colormap='viridis', colorbar=False ) ax.set_title('Sampling ratio = {}'.format(multiplier)) plt.tight_layout() plt.show() imbalanced-learn-0.7.0/examples/ensemble/000077500000000000000000000000001366766276300203325ustar00rootroot00000000000000imbalanced-learn-0.7.0/examples/ensemble/README.txt000066400000000000000000000005511366766276300220310ustar00rootroot00000000000000.. _ensemble_examples: Example using ensemble class methods ==================================== Under-sampling methods imply that samples of the majority class are lost during the balancing procedure. Ensemble methods offer an alternative that uses most of the samples.
In fact, an ensemble of balanced sets is created and later used to train any classifier. imbalanced-learn-0.7.0/examples/ensemble/plot_comparison_ensemble_classifier.py000066400000000000000000000220571366766276300302000ustar00rootroot00000000000000""" ============================================================== Comparison of ensembling classifiers internally using sampling ============================================================== Ensemble classifiers have been shown to improve classification performance compared to a single learner. However, they will be affected by class imbalance. This example shows the benefit of balancing the training set before training the learners. We make the comparison with non-balanced ensemble methods. We make a comparison using the balanced accuracy and geometric mean which are metrics widely used in the literature to evaluate models learned on imbalanced sets. """ # Authors: Guillaume Lemaitre # License: MIT import itertools import matplotlib.pyplot as plt import numpy as np from sklearn.model_selection import train_test_split from sklearn.ensemble import BaggingClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import confusion_matrix from sklearn.metrics import balanced_accuracy_score from imblearn.datasets import fetch_datasets from imblearn.ensemble import BalancedBaggingClassifier from imblearn.ensemble import BalancedRandomForestClassifier from imblearn.ensemble import EasyEnsembleClassifier from imblearn.ensemble import RUSBoostClassifier from imblearn.metrics import geometric_mean_score def plot_confusion_matrix(cm, classes, ax, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues): """ This function prints and plots the confusion matrix. Normalization can be applied by setting `normalize=True`. """ print(cm) print('') ax.imshow(cm, interpolation='nearest', cmap=cmap) ax.set_title(title) tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.sca(ax) plt.yticks(tick_marks, classes) fmt = '.2f' if normalize else 'd' thresh = cm.max() / 2. for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): ax.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") ax.set_ylabel('True label') ax.set_xlabel('Predicted label') ############################################################################### # Load an imbalanced dataset ############################################################################### # We will load the UCI SatImage dataset which has an imbalanced ratio of 9.3:1 # (number of majority samples for each minority sample). The data are then split # into training and testing. satimage = fetch_datasets()['satimage'] X, y = satimage.data, satimage.target X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0) ############################################################################### # Classification using a single decision tree ############################################################################### # We train a decision tree classifier which will be used as a baseline for the # rest of this example. ############################################################################### # The results are reported in terms of balanced accuracy and geometric mean # which are metrics widely used in the literature to validate models trained on # imbalanced sets.
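###############################################################################
# As a quick illustration of the geometric mean (a minimal additional sketch
# on toy labels, not part of the original benchmark), it is the root of the
# product of the per-class recalls: a model that always predicts the majority
# class therefore scores 0 despite a high accuracy.

import numpy as np
from sklearn.metrics import recall_score

y_true_toy = np.array([0] * 90 + [1] * 10)
y_pred_toy = np.zeros(100, dtype=int)  # always predict the majority class
per_class_recall = recall_score(y_true_toy, y_pred_toy, average=None)
print('Per-class recall: {}'.format(per_class_recall))
print('Geometric mean: {:.2f}'.format(np.sqrt(np.prod(per_class_recall))))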
tree = DecisionTreeClassifier() tree.fit(X_train, y_train) y_pred_tree = tree.predict(X_test) print('Decision tree classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' .format(balanced_accuracy_score(y_test, y_pred_tree), geometric_mean_score(y_test, y_pred_tree))) cm_tree = confusion_matrix(y_test, y_pred_tree) fig, ax = plt.subplots() plot_confusion_matrix(cm_tree, classes=np.unique(satimage.target), ax=ax, title='Decision tree') ############################################################################### # Classification using bagging classifier with and without sampling ############################################################################### # Instead of using a single tree, we will check if an ensemble of decision # trees can actually alleviate the issue induced by the class imbalance. First, # we will use a bagging classifier and its counterpart which internally uses a # random under-sampling to balance each bootstrap sample. bagging = BaggingClassifier(n_estimators=50, random_state=0) balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0) bagging.fit(X_train, y_train) balanced_bagging.fit(X_train, y_train) y_pred_bc = bagging.predict(X_test) y_pred_bbc = balanced_bagging.predict(X_test) ############################################################################### # Balancing each bootstrap sample allows us to significantly increase the # balanced accuracy and the geometric mean. print('Bagging classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' .format(balanced_accuracy_score(y_test, y_pred_bc), geometric_mean_score(y_test, y_pred_bc))) cm_bagging = confusion_matrix(y_test, y_pred_bc) fig, ax = plt.subplots(ncols=2) plot_confusion_matrix(cm_bagging, classes=np.unique(satimage.target), ax=ax[0], title='Bagging') print('Balanced Bagging classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' .format(balanced_accuracy_score(y_test, y_pred_bbc), geometric_mean_score(y_test, y_pred_bbc))) cm_balanced_bagging = confusion_matrix(y_test, y_pred_bbc) plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(satimage.target), ax=ax[1], title='Balanced bagging') ############################################################################### # Classification using random forest classifier with and without sampling ############################################################################### # Random forest is another popular ensemble method and it usually # outperforms bagging. Here, we use a vanilla random forest and its balanced # counterpart in which each bootstrap sample is balanced. rf = RandomForestClassifier(n_estimators=50, random_state=0) brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0) rf.fit(X_train, y_train) brf.fit(X_train, y_train) y_pred_rf = rf.predict(X_test) y_pred_brf = brf.predict(X_test) # Similarly to the previous experiment, the balanced classifier outperforms the # classifier which learns from imbalanced bootstrap samples. In addition, random # forest outperforms the bagging classifier.
print('Random Forest classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' .format(balanced_accuracy_score(y_test, y_pred_rf), geometric_mean_score(y_test, y_pred_rf))) cm_rf = confusion_matrix(y_test, y_pred_rf) fig, ax = plt.subplots(ncols=2) plot_confusion_matrix(cm_rf, classes=np.unique(satimage.target), ax=ax[0], title='Random forest') print('Balanced Random Forest classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' .format(balanced_accuracy_score(y_test, y_pred_brf), geometric_mean_score(y_test, y_pred_brf))) cm_brf = confusion_matrix(y_test, y_pred_brf) plot_confusion_matrix(cm_brf, classes=np.unique(satimage.target), ax=ax[1], title='Balanced random forest') ############################################################################### # Boosting classifier ############################################################################### # In the same manner, the easy ensemble classifier is a bag of balanced # AdaBoost classifiers. However, it will be slower to train than random forest # and will achieve a worse performance. base_estimator = AdaBoostClassifier(n_estimators=10) eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=base_estimator) eec.fit(X_train, y_train) y_pred_eec = eec.predict(X_test) print('Easy ensemble classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' .format(balanced_accuracy_score(y_test, y_pred_eec), geometric_mean_score(y_test, y_pred_eec))) cm_eec = confusion_matrix(y_test, y_pred_eec) fig, ax = plt.subplots(ncols=2) plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0], title='Easy ensemble classifier') rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator) rusboost.fit(X_train, y_train) y_pred_rusboost = rusboost.predict(X_test) print('RUSBoost classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' .format(balanced_accuracy_score(y_test, y_pred_rusboost), geometric_mean_score(y_test, y_pred_rusboost))) cm_rusboost = confusion_matrix(y_test, y_pred_rusboost) plot_confusion_matrix(cm_rusboost, classes=np.unique(satimage.target), ax=ax[1], title='RUSBoost classifier') plt.show() imbalanced-learn-0.7.0/examples/evaluation/000077500000000000000000000000001366766276300207075ustar00rootroot00000000000000imbalanced-learn-0.7.0/examples/evaluation/README.txt000066400000000000000000000002221366766276300224010ustar00rootroot00000000000000.. _evaluation_examples: Evaluation examples ------------------- Examples illustrating how classification using an imbalanced dataset can be done. imbalanced-learn-0.7.0/examples/evaluation/plot_classification_report.py000066400000000000000000000031701366766276300267060ustar00rootroot00000000000000""" ============================================= Evaluate classification by compiling a report ============================================= Specific metrics have been developed to evaluate classifiers which have been trained using imbalanced data. `imblearn` provides a classification report similar to `sklearn`, with additional metrics specific to imbalanced learning problems.
""" # Authors: Guillaume Lemaitre # License: MIT from sklearn import datasets from sklearn.svm import LinearSVC from sklearn.model_selection import train_test_split from imblearn import over_sampling as os from imblearn import pipeline as pl from imblearn.metrics import classification_report_imbalanced print(__doc__) RANDOM_STATE = 42 # Generate a dataset X, y = datasets.make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=10, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=4, n_samples=5000, random_state=RANDOM_STATE) pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE), LinearSVC(random_state=RANDOM_STATE)) # Split the data X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE) # Train the classifier with balancing pipeline.fit(X_train, y_train) # Test the classifier and get the prediction y_pred_bal = pipeline.predict(X_test) # Show the classification report print(classification_report_imbalanced(y_test, y_pred_bal)) imbalanced-learn-0.7.0/examples/evaluation/plot_metrics.py000066400000000000000000000050651366766276300237730ustar00rootroot00000000000000""" ======================================= Metrics specific to imbalanced learning ======================================= Specific metrics have been developed to evaluate classifier which has been trained using imbalanced data. `imblearn` provides mainly two additional metrics which are not implemented in `sklearn`: (i) geometric mean and (ii) index balanced accuracy. """ # Authors: Guillaume Lemaitre # License: MIT from sklearn import datasets from sklearn.svm import LinearSVC from sklearn.model_selection import train_test_split from imblearn import over_sampling as os from imblearn import pipeline as pl from imblearn.metrics import (geometric_mean_score, make_index_balanced_accuracy) print(__doc__) RANDOM_STATE = 42 # Generate a dataset X, y = datasets.make_classification(n_classes=3, class_sep=2, weights=[0.1, 0.9], n_informative=10, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=4, n_samples=5000, random_state=RANDOM_STATE) pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE), LinearSVC(random_state=RANDOM_STATE)) # Split the data X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE) # Train the classifier with balancing pipeline.fit(X_train, y_train) # Test the classifier and get the prediction y_pred_bal = pipeline.predict(X_test) ############################################################################### # The geometric mean corresponds to the square root of the product of the # sensitivity and specificity. Combining the two metrics should account for # the balancing of the dataset. print('The geometric mean is {}'.format(geometric_mean_score( y_test, y_pred_bal))) ############################################################################### # The index balanced accuracy can transform any metric to be used in # imbalanced learning problems. 
alpha = 0.1 geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)( geometric_mean_score) print('The IBA using alpha = {} and the geometric mean: {}'.format( alpha, geo_mean( y_test, y_pred_bal))) alpha = 0.5 geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)( geometric_mean_score) print('The IBA using alpha = {} and the geometric mean: {}'.format( alpha, geo_mean( y_test, y_pred_bal))) imbalanced-learn-0.7.0/examples/model_selection/000077500000000000000000000000001366766276300217055ustar00rootroot00000000000000imbalanced-learn-0.7.0/examples/model_selection/README.txt000066400000000000000000000001701366766276300234010ustar00rootroot00000000000000.. _model_selection_examples: Model Selection --------------- Examples related to the selection of balancing methods. imbalanced-learn-0.7.0/examples/model_selection/plot_validation_curve.py000066400000000000000000000046701366766276300266600ustar00rootroot00000000000000""" ========================== Plotting Validation Curves ========================== In this example the impact of SMOTE's k_neighbors parameter is examined. In the plot you can see the validation scores of a SMOTE-CART classifier for different values of SMOTE's k_neighbors parameter. """ # Authors: Christos Aridas # Guillaume Lemaitre # License: MIT import matplotlib.pyplot as plt import numpy as np from sklearn import model_selection as ms from sklearn import datasets, metrics, tree from imblearn import over_sampling as os from imblearn import pipeline as pl print(__doc__) RANDOM_STATE = 42 scorer = metrics.make_scorer(metrics.cohen_kappa_score) # Generate the dataset X, y = datasets.make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=10, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=4, n_samples=5000, random_state=RANDOM_STATE) smote = os.SMOTE(random_state=RANDOM_STATE) cart = tree.DecisionTreeClassifier(random_state=RANDOM_STATE) pipeline = pl.make_pipeline(smote, cart) param_range = range(1, 11) train_scores, test_scores = ms.validation_curve( pipeline, X, y, param_name="smote__k_neighbors", param_range=param_range, cv=3, scoring=scorer) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) plt.plot(param_range, test_scores_mean, label='SMOTE') ax.fill_between(param_range, test_scores_mean + test_scores_std, test_scores_mean - test_scores_std, alpha=0.2) idx_max = np.argmax(test_scores_mean) plt.scatter(param_range[idx_max], test_scores_mean[idx_max], label=r'Cohen Kappa: ${:.2f}\pm{:.2f}$'.format( test_scores_mean[idx_max], test_scores_std[idx_max])) plt.title("Validation Curve with SMOTE-CART") plt.xlabel("k_neighbors") plt.ylabel("Cohen's kappa") # make nice plotting ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left() ax.spines['left'].set_position(('outward', 10)) ax.spines['bottom'].set_position(('outward', 10)) plt.xlim([1, 10]) plt.ylim([0.4, 0.8]) plt.legend(loc="best") plt.show() imbalanced-learn-0.7.0/examples/over-sampling/000077500000000000000000000000001366766276300213235ustar00rootroot00000000000000imbalanced-learn-0.7.0/examples/over-sampling/README.txt000066400000000000000000000003771366766276300230260ustar00rootroot00000000000000..
_over_sampling_examples: Example using over-sampling class methods ========================================= Data balancing can be performed by over-sampling such that new samples are generated in the minority class to reach a given balancing ratio. imbalanced-learn-0.7.0/examples/over-sampling/plot_comparison_over_sampling.py000066400000000000000000000245661366766276300300470ustar00rootroot00000000000000""" ==================================================== Comparison of the different over-sampling algorithms ==================================================== The following example attempts to make a qualitative comparison between the different over-sampling algorithms available in the imbalanced-learn package. """ # Authors: Guillaume Lemaitre # License: MIT from collections import Counter import matplotlib.pyplot as plt import numpy as np from sklearn.datasets import make_classification from sklearn.svm import LinearSVC from imblearn.pipeline import make_pipeline from imblearn.over_sampling import ADASYN from imblearn.over_sampling import (SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC, KMeansSMOTE) from imblearn.over_sampling import RandomOverSampler from imblearn.base import BaseSampler print(__doc__) ############################################################################### # The following function will be used to create a toy dataset. It uses # ``make_classification`` from scikit-learn but fixes some parameters. def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3, class_sep=0.8, n_clusters=1): return make_classification(n_samples=n_samples, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters, weights=list(weights), class_sep=class_sep, random_state=0) ############################################################################### # The following function will be used to plot the sample space after resampling # to illustrate the characteristics of an algorithm. def plot_resampling(X, y, sampling, ax): X_res, y_res = sampling.fit_resample(X, y) ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k') # make nice plotting ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left() ax.spines['left'].set_position(('outward', 10)) ax.spines['bottom'].set_position(('outward', 10)) return Counter(y_res) ############################################################################### # The following function will be used to plot the decision function of a # classifier given some data. def plot_decision_function(X, y, clf, ax): plot_step = 0.02 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, alpha=0.4) ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor='k') ############################################################################### # Illustration of the influence of the balancing ratio ############################################################################### ############################################################################### # We will first illustrate the influence of the balancing ratio on some toy # data using a linear SVM classifier. The greater the difference between the # number of samples in each class, the poorer the classification results.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12)) ax_arr = (ax1, ax2, ax3, ax4) weights_arr = ((0.01, 0.01, 0.98), (0.01, 0.05, 0.94), (0.2, 0.1, 0.7), (0.33, 0.33, 0.33)) for ax, weights in zip(ax_arr, weights_arr): X, y = create_dataset(n_samples=1000, weights=weights) clf = LinearSVC().fit(X, y) plot_decision_function(X, y, clf, ax) ax.set_title('Linear SVC with y={}'.format(Counter(y))) fig.tight_layout() ############################################################################### # Random over-sampling to balance the data set ############################################################################### ############################################################################### # Random over-sampling can be used to repeat some samples and balance the # number of samples between the classes. It can be seen that with this trivial # approach the decision boundary is already less biased toward the majority # class. fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7)) X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94)) clf = LinearSVC().fit(X, y) plot_decision_function(X, y, clf, ax1) ax1.set_title('Linear SVC with y={}'.format(Counter(y))) pipe = make_pipeline(RandomOverSampler(random_state=0), LinearSVC()) pipe.fit(X, y) plot_decision_function(X, y, pipe, ax2) ax2.set_title('Decision function for RandomOverSampler') fig.tight_layout() ############################################################################### # More advanced over-sampling using ADASYN and SMOTE ############################################################################### ############################################################################### # Instead of repeating the same samples when over-sampling, we can use some # specific heuristics. ADASYN and SMOTE can be used in this case. # Make an identity sampler class FakeSampler(BaseSampler): _sampling_type = 'bypass' def _fit_resample(self, X, y): return X, y fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 15)) X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94)) sampler = FakeSampler() clf = make_pipeline(sampler, LinearSVC()) plot_resampling(X, y, sampler, ax1) ax1.set_title('Original data - y={}'.format(Counter(y))) ax_arr = (ax2, ax3, ax4) for ax, sampler in zip(ax_arr, (RandomOverSampler(random_state=0), SMOTE(random_state=0), ADASYN(random_state=0))): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_resampling(X, y, sampler, ax) ax.set_title('Resampling using {}'.format(sampler.__class__.__name__)) fig.tight_layout() ############################################################################### # The following plots illustrate the difference between ADASYN and SMOTE. ADASYN # will focus on the samples which are difficult to classify with a # nearest-neighbors rule while regular SMOTE will not make any distinction. # Therefore, the decision function will differ depending on the algorithm.
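###############################################################################
# As a quick numerical check (a small additional sketch, not part of the
# original figures), both samplers roughly equalize the class counts, even
# though they pick different samples to interpolate from.

X_check, y_check = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
for sampler_check in (SMOTE(random_state=0), ADASYN(random_state=0)):
    _, y_res_check = sampler_check.fit_resample(X_check, y_check)
    print('{} -> {}'.format(sampler_check.__class__.__name__,
                            Counter(y_res_check)))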
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6)) X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94)) clf = LinearSVC().fit(X, y) plot_decision_function(X, y, clf, ax1) ax1.set_title('Linear SVC with y={}'.format(Counter(y))) sampler = SMOTE() clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax2) ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__)) sampler = ADASYN() clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax3) ax3.set_title('Decision function for {}'.format(sampler.__class__.__name__)) fig.tight_layout() ############################################################################### # Due to those sampling particularities, these methods can give rise to some # specific issues as illustrated below. fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 15)) X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8) ax_arr = ((ax1, ax2), (ax3, ax4)) for ax, sampler in zip(ax_arr, (SMOTE(random_state=0), ADASYN(random_state=0))): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax[0]) ax[0].set_title('Decision function for {}'.format( sampler.__class__.__name__)) plot_resampling(X, y, sampler, ax[1]) ax[1].set_title('Resampling using {}'.format( sampler.__class__.__name__)) fig.tight_layout() ############################################################################### # SMOTE proposes several variants by identifying specific samples to consider # during the resampling. The borderline version will detect which points to # select, namely those on the border between two classes. The SVM version will # use the support vectors found using an SVM algorithm to create new samples # while the KMeans version will perform a clustering before generating samples # in each cluster independently, depending on each cluster density. fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)) = plt.subplots(5, 2, figsize=(15, 30)) X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8) ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)) for ax, sampler in zip(ax_arr, (SMOTE(random_state=0), BorderlineSMOTE(random_state=0, kind='borderline-1'), BorderlineSMOTE(random_state=0, kind='borderline-2'), KMeansSMOTE(random_state=0), SVMSMOTE(random_state=0))): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax[0]) ax[0].set_title('Decision function for {}'.format( sampler.__class__.__name__)) plot_resampling(X, y, sampler, ax[1]) ax[1].set_title('Resampling using {}'.format(sampler.__class__.__name__)) fig.tight_layout() ############################################################################### # When dealing with a mix of continuous and categorical features, SMOTE-NC # is the only method which can handle this case.
# create a synthetic data set with continuous and categorical features rng = np.random.RandomState(42) n_samples = 50 X = np.empty((n_samples, 3), dtype=object) X[:, 0] = rng.choice(['A', 'B', 'C'], size=n_samples).astype(object) X[:, 1] = rng.randn(n_samples) X[:, 2] = rng.randint(3, size=n_samples) y = np.array([0] * 20 + [1] * 30) print('The original imbalanced dataset') print(sorted(Counter(y).items())) print('The first and last columns are containing categorical features:') print(X[:5]) smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0) X_resampled, y_resampled = smote_nc.fit_resample(X, y) print('Dataset after resampling:') print(sorted(Counter(y_resampled).items())) print('SMOTE-NC will generate categories for the categorical features:') print(X_resampled[-5:]) plt.show() imbalanced-learn-0.7.0/examples/over-sampling/plot_illustration_generation_sample.py000066400000000000000000000043401366766276300312410ustar00rootroot00000000000000""" ==================================================================== Illustration of the sample generation in the over-sampling algorithm ==================================================================== This example illustrates how a new sample is generated taking into account the neighbourhood of this sample. A new sample is generated by randomly selecting 2 samples of the same class and interpolating a point between them. """ # Authors: Guillaume Lemaitre # License: MIT import matplotlib.pyplot as plt import numpy as np print(__doc__) rng = np.random.RandomState(18) f, ax = plt.subplots(1, 1, figsize=(8, 8)) # generate some data points y = np.array([3.65284, 3.52623, 3.51468, 3.22199, 3.21]) z = np.array([0.43, 0.45, 0.6, 0.4, 0.211]) y_2 = np.array([3.3, 3.6]) z_2 = np.array([0.58, 0.34]) # plot the majority and minority samples ax.scatter(z, y, label='Minority class', s=100) ax.scatter(z_2, y_2, label='Majority class', s=100) idx = rng.randint(len(y), size=2) annotation = [r'$x_i$', r'$x_{zi}$'] for a, i in zip(annotation, idx): ax.annotate(a, (z[i], y[i]), xytext=tuple([z[i] + 0.01, y[i] + 0.005]), fontsize=15) # draw the circle in which the new sample will be generated radius = np.sqrt((z[idx[0]] - z[idx[1]]) ** 2 + (y[idx[0]] - y[idx[1]]) ** 2) circle = plt.Circle((z[idx[0]], y[idx[0]]), radius=radius, alpha=0.2) ax.add_artist(circle) # plot the line on which the sample will be generated ax.plot(z[idx], y[idx], '--', alpha=0.5) # create and plot the new sample step = rng.uniform() y_gen = y[idx[0]] + step * (y[idx[1]] - y[idx[0]]) z_gen = z[idx[0]] + step * (z[idx[1]] - z[idx[0]]) ax.scatter(z_gen, y_gen, s=100) ax.annotate(r'$x_{new}$', (z_gen, y_gen), xytext=tuple([z_gen + 0.01, y_gen + 0.005]), fontsize=15) # make the plot nicer with legend and label ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left() ax.spines['left'].set_position(('outward', 10)) ax.spines['bottom'].set_position(('outward', 10)) ax.set_xlim([0.2, 0.7]) ax.set_ylim([3.2, 3.7]) plt.xlabel(r'$X_1$') plt.ylabel(r'$X_2$') plt.legend() plt.tight_layout() plt.show() imbalanced-learn-0.7.0/examples/pipeline/000077500000000000000000000000001366766276300203455ustar00rootroot00000000000000imbalanced-learn-0.7.0/examples/pipeline/README.txt000066400000000000000000000002331366766276300220410ustar00rootroot00000000000000..
_pipeline_examples: Pipeline examples ================= Example of how to use a pipeline to include under-sampling with `scikit-learn` estimators.imbalanced-learn-0.7.0/examples/pipeline/plot_pipeline_classification.py000066400000000000000000000027601366766276300266420ustar00rootroot00000000000000""" ========================= Pipeline Object ========================= An example of the Pipeline object working with transformers and resamplers. """ # Authors: Christos Aridas # Guillaume Lemaitre # License: MIT from sklearn.model_selection import train_test_split as tts from sklearn.datasets import make_classification from sklearn.decomposition import PCA from sklearn.metrics import classification_report from sklearn.neighbors import KNeighborsClassifier as KNN from imblearn.pipeline import make_pipeline from imblearn.under_sampling import (EditedNearestNeighbours, RepeatedEditedNearestNeighbours) print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7], n_informative=3, n_redundant=1, flip_y=0, n_features=5, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instantiate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Create the samplers enn = EditedNearestNeighbours() renn = RepeatedEditedNearestNeighbours() # Create the classifier knn = KNN(1) # Make the splits X_train, X_test, y_train, y_test = tts(X, y, random_state=42) # Add one transformer and two samplers in the pipeline object pipeline = make_pipeline(pca, enn, renn, knn) pipeline.fit(X_train, y_train) y_hat = pipeline.predict(X_test) print(classification_report(y_test, y_hat)) imbalanced-learn-0.7.0/examples/plot_outlier_rejections.py000066400000000000000000000110301366766276300240550ustar00rootroot00000000000000""" =============================================================== Customized sampler to implement an outlier rejections estimator =============================================================== This example illustrates the use of a custom sampler to implement an outlier rejections estimator. It can be used easily within a pipeline in which the number of samples can vary during training, which usually is a limitation of the current scikit-learn pipeline. """ # Authors: Guillaume Lemaitre # License: MIT import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import make_moons, make_blobs from sklearn.ensemble import IsolationForest from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report from imblearn import FunctionSampler from imblearn.pipeline import make_pipeline print(__doc__) rng = np.random.RandomState(42) def plot_scatter(X, y, title): """Function to plot some data as a scatter plot.""" plt.figure() plt.scatter(X[y == 1, 0], X[y == 1, 1], label='Class #1') plt.scatter(X[y == 0, 0], X[y == 0, 1], label='Class #0') plt.legend() plt.title(title) ############################################################################## # Toy data generation ############################################################################## ############################################################################## # We are generating a non-Gaussian data set contaminated with some uniform # noise.
moons, _ = make_moons(n_samples=500, noise=0.05) blobs, _ = make_blobs(n_samples=500, centers=[(-0.75, 2.25), (1.0, 2.0)], cluster_std=0.25) outliers = rng.uniform(low=-3, high=3, size=(500, 2)) X_train = np.vstack([moons, blobs, outliers]) y_train = np.hstack([np.ones(moons.shape[0], dtype=np.int8), np.zeros(blobs.shape[0], dtype=np.int8), rng.randint(0, 2, size=outliers.shape[0], dtype=np.int8)]) plot_scatter(X_train, y_train, 'Training dataset') ############################################################################## # We will generate some cleaned test data without outliers. moons, _ = make_moons(n_samples=50, noise=0.05) blobs, _ = make_blobs(n_samples=50, centers=[(-0.75, 2.25), (1.0, 2.0)], cluster_std=0.25) X_test = np.vstack([moons, blobs]) y_test = np.hstack([np.ones(moons.shape[0], dtype=np.int8), np.zeros(blobs.shape[0], dtype=np.int8)]) plot_scatter(X_test, y_test, 'Testing dataset') ############################################################################## # How to use the :class:`imblearn.FunctionSampler` ############################################################################## ############################################################################## # We first define a function which will use # :class:`sklearn.ensemble.IsolationForest` to eliminate some outliers from # our dataset during training. The function passed to the # :class:`imblearn.FunctionSampler` will be called when using the method # ``fit_resample``. def outlier_rejection(X, y): """This will be our function used to resample our dataset.""" model = IsolationForest(max_samples=100, contamination=0.4, random_state=rng) model.fit(X) y_pred = model.predict(X) return X[y_pred == 1], y[y_pred == 1] reject_sampler = FunctionSampler(func=outlier_rejection) X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train) plot_scatter(X_inliers, y_inliers, 'Training data without outliers') ############################################################################## # Integrate it within a pipeline ############################################################################## ############################################################################## # By eliminating outliers before the training, the classifier will be less # affected during the prediction. pipe = make_pipeline(FunctionSampler(func=outlier_rejection), LogisticRegression(solver='lbfgs', multi_class='auto', random_state=rng)) y_pred = pipe.fit(X_train, y_train).predict(X_test) print(classification_report(y_test, y_pred)) clf = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=rng) y_pred = clf.fit(X_train, y_train).predict(X_test) print(classification_report(y_test, y_pred)) plt.show() imbalanced-learn-0.7.0/examples/plot_sampling_strategy_usage.py000066400000000000000000000200011366766276300250610ustar00rootroot00000000000000""" ========================================================================= Usage of the ``sampling_strategy`` parameter for the different algorithms ========================================================================= This example shows the different usages of the parameter ``sampling_strategy`` for the different families of samplers (i.e. over-sampling, under-sampling, or cleaning methods).
""" # Authors: Guillaume Lemaitre # License: MIT from collections import Counter import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import load_iris from imblearn.datasets import make_imbalance from imblearn.over_sampling import RandomOverSampler from imblearn.under_sampling import RandomUnderSampler from imblearn.under_sampling import TomekLinks print(__doc__) def plot_pie(y): target_stats = Counter(y) labels = list(target_stats.keys()) sizes = list(target_stats.values()) explode = tuple([0.1] * len(target_stats)) def make_autopct(values): def my_autopct(pct): total = sum(values) val = int(round(pct * total / 100.0)) return '{p:.2f}% ({v:d})'.format(p=pct, v=val) return my_autopct fig, ax = plt.subplots() ax.pie(sizes, explode=explode, labels=labels, shadow=True, autopct=make_autopct(sizes)) ax.axis('equal') ############################################################################### # First, we will create an imbalanced data set from a the iris data set. iris = load_iris() print('Information of the original iris data set: \n {}'.format( Counter(iris.target))) plot_pie(iris.target) sampling_strategy = {0: 10, 1: 20, 2: 47} X, y = make_imbalance(iris.data, iris.target, sampling_strategy=sampling_strategy) print('Information of the iris data set after making it' ' imbalanced using a dict: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y))) plot_pie(y) ############################################################################### # Using ``sampling_strategy`` in resampling algorithms ############################################################################### ############################################################################### # ``sampling_strategy`` as a ``float`` # .................................... # # ``sampling_strategy`` can be given a ``float``. For **under-sampling # methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by # :math:`N_{rM} = \\alpha_{us} \\times N_{m}` where :math:`N_{rM}` and # :math:`N_{m}` are the number of samples in the majority class after # resampling and the number of samples in the minority class, respectively. # select only 2 classes since the ratio make sense in this case binary_mask = np.bitwise_or(y == 0, y == 2) binary_y = y[binary_mask] binary_X = X[binary_mask] sampling_strategy = 0.8 rus = RandomUnderSampler(sampling_strategy=sampling_strategy) X_res, y_res = rus.fit_resample(binary_X, binary_y) print('Information of the iris data set after making it ' 'balanced using a float and an under-sampling method: \n ' 'sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) plot_pie(y_res) ############################################################################### # For **over-sampling methods**, it correspond to the ratio # :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}` # where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the # minority class after resampling and the number of samples in the majority # class, respectively. ros = RandomOverSampler(sampling_strategy=sampling_strategy) X_res, y_res = ros.fit_resample(binary_X, binary_y) print('Information of the iris data set after making it ' 'balanced using a float and an over-sampling method: \n ' 'sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) plot_pie(y_res) ############################################################################### # ``sampling_strategy`` has a ``str`` # ................................... 
# # ``sampling_strategy`` can be given as a string which specifies the classes # targeted by the resampling. With under- and over-sampling, the number of # samples will be equalized. # # Note that we are using multiple classes from now on. sampling_strategy = 'not minority' rus = RandomUnderSampler(sampling_strategy=sampling_strategy) X_res, y_res = rus.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by under-sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) plot_pie(y_res) sampling_strategy = 'not majority' ros = RandomOverSampler(sampling_strategy=sampling_strategy) X_res, y_res = ros.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by over-sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) plot_pie(y_res) ############################################################################### # With **cleaning methods**, the number of samples in each class will not be # equalized even if targeted. sampling_strategy = 'not minority' tl = TomekLinks(sampling_strategy=sampling_strategy) X_res, y_res = tl.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) plot_pie(y_res) ############################################################################### # ``sampling_strategy`` as a ``dict`` # ................................... # # When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted # classes. The values correspond to the desired number of samples for each # targeted class. This works for both **under- and over-sampling** # algorithms but not for the **cleaning algorithms**. Use a ``list`` instead. sampling_strategy = {0: 10, 1: 15, 2: 20} rus = RandomUnderSampler(sampling_strategy=sampling_strategy) X_res, y_res = rus.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by under-sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) plot_pie(y_res) sampling_strategy = {0: 25, 1: 35, 2: 47} ros = RandomOverSampler(sampling_strategy=sampling_strategy) X_res, y_res = ros.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by over-sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) plot_pie(y_res) ############################################################################### # ``sampling_strategy`` as a ``list`` # ................................... # # When ``sampling_strategy`` is a ``list``, the list contains the targeted # classes. It is used only for **cleaning methods** and raises an error # otherwise. sampling_strategy = [0, 1, 2] tl = TomekLinks(sampling_strategy=sampling_strategy) X_res, y_res = tl.fit_resample(X, y) print('Information of the iris data set after making it ' 'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) plot_pie(y_res) ############################################################################### # ``sampling_strategy`` as a callable # ................................... # # When ``sampling_strategy`` is a callable, it is a function taking ``y`` and # returning a ``dict``. The keys correspond to the targeted classes. The values # correspond to the desired number of samples for each class.
def ratio_multiplier(y): multiplier = {1: 0.7, 2: 0.95} target_stats = Counter(y) for key, value in target_stats.items(): if key in multiplier: target_stats[key] = int(value * multiplier[key]) return target_stats X_res, y_res = (RandomUnderSampler(sampling_strategy=ratio_multiplier) .fit_resample(X, y)) print('Information of the iris data set after balancing using a callable' ' mode:\n ratio={} \n y: {}'.format(ratio_multiplier, Counter(y_res))) plot_pie(y_res) plt.show() imbalanced-learn-0.7.0/examples/under-sampling/000077500000000000000000000000001366766276300214655ustar00rootroot00000000000000imbalanced-learn-0.7.0/examples/under-sampling/README.txt000066400000000000000000000005121366766276300231610ustar00rootroot00000000000000.. _under_sampling_examples: Example using under-sampling class methods ========================================== Under-sampling refers to the process of reducing the number of samples in the majority classes. The implemented methods can be categorized into 2 groups: (i) fixed under-sampling and (ii) cleaning under-sampling. imbalanced-learn-0.7.0/examples/under-sampling/plot_comparison_under_sampling.py000066400000000000000000000254341366766276300303460ustar00rootroot00000000000000""" ===================================================== Comparison of the different under-sampling algorithms ===================================================== The following example attempts to make a qualitative comparison between the different under-sampling algorithms available in the imbalanced-learn package. """ # Authors: Guillaume Lemaitre # License: MIT from collections import Counter import matplotlib.pyplot as plt import numpy as np from sklearn.datasets import make_classification from sklearn.svm import LinearSVC from sklearn.linear_model import LogisticRegression from imblearn.pipeline import make_pipeline from imblearn.under_sampling import (ClusterCentroids, RandomUnderSampler, NearMiss, InstanceHardnessThreshold, CondensedNearestNeighbour, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NeighbourhoodCleaningRule, OneSidedSelection) print(__doc__) ############################################################################### # The following function will be used to create a toy dataset. It uses # ``make_classification`` from scikit-learn but fixes some parameters. def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3, class_sep=0.8, n_clusters=1): return make_classification(n_samples=n_samples, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters, weights=list(weights), class_sep=class_sep, random_state=0) ############################################################################### # The following function will be used to plot the sample space after resampling # to illustrate the characteristics of an algorithm. def plot_resampling(X, y, sampling, ax): X_res, y_res = sampling.fit_resample(X, y) ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k') # make nice plotting ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left() ax.spines['left'].set_position(('outward', 10)) ax.spines['bottom'].set_position(('outward', 10)) return Counter(y_res) ############################################################################### # The following function will be used to plot the decision function of a # classifier given some data.
def plot_decision_function(X, y, clf, ax): plot_step = 0.02 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, alpha=0.4) ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor='k') ############################################################################### # Prototype generation: under-sampling by generating new samples ############################################################################### ############################################################################### # ``ClusterCentroids`` under-samples by replacing the original samples by the # centroids of the clusters found. fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6)) X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8) clf = LinearSVC().fit(X, y) plot_decision_function(X, y, clf, ax1) ax1.set_title('Linear SVC with y={}'.format(Counter(y))) sampler = ClusterCentroids(random_state=0) clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax2) ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__)) plot_resampling(X, y, sampler, ax3) ax3.set_title('Resampling using {}'.format(sampler.__class__.__name__)) fig.tight_layout() ############################################################################### # Prototype selection: under-sampling by selecting existing samples ############################################################################### ############################################################################### # The algorithms performing prototype selection can be subdivided into two # groups: (i) the controlled under-sampling methods and (ii) the cleaning # under-sampling methods. ############################################################################### # With the controlled under-sampling methods, the number of samples to be # selected can be specified. ``RandomUnderSampler`` is the most naive way of # performing such selection by randomly selecting a given number of samples # from the targeted class. fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6)) X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8) clf = LinearSVC().fit(X, y) plot_decision_function(X, y, clf, ax1) ax1.set_title('Linear SVC with y={}'.format(Counter(y))) sampler = RandomUnderSampler(random_state=0) clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax2) ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__)) plot_resampling(X, y, sampler, ax3) ax3.set_title('Resampling using {}'.format(sampler.__class__.__name__)) fig.tight_layout() ############################################################################### # ``NearMiss`` algorithms implement some heuristic rules in order to select # samples. NearMiss-1 selects samples from the majority class for which the # average distance to the :math:`k` nearest samples of the minority class is # the smallest. NearMiss-2 selects the samples from the majority class for # which the average distance to the farthest samples of the minority class is # the smallest.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2,
                                                         figsize=(15, 25))
X, y = create_dataset(n_samples=5000, weights=(0.1, 0.2, 0.7), class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(ax_arr, (NearMiss(version=1),
                                NearMiss(version=2),
                                NearMiss(version=3))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}-{}'.format(
        sampler.__class__.__name__, sampler.version))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}-{}'.format(
        sampler.__class__.__name__, sampler.version))
fig.tight_layout()

###############################################################################
# ``EditedNearestNeighbours`` removes samples of the majority class whose
# class differs from the one of their nearest neighbors. This sieve can be
# repeated, which is the principle of ``RepeatedEditedNearestNeighbours``.
# ``AllKNN`` is slightly different from ``RepeatedEditedNearestNeighbours``
# in that it changes the :math:`k` parameter of the internal nearest
# neighbors algorithm, increasing it at each iteration.

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2,
                                                         figsize=(15, 25))
X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(ax_arr, (
        EditedNearestNeighbours(),
        RepeatedEditedNearestNeighbours(),
        AllKNN(allow_minority=True))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}'.format(
        sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# ``CondensedNearestNeighbour`` makes use of a 1-NN to iteratively decide if a
# sample should be kept in a dataset or not. The issue is that
# ``CondensedNearestNeighbour`` is sensitive to noise since it preserves the
# noisy samples. ``OneSidedSelection`` also uses a 1-NN, and additionally
# uses ``TomekLinks`` to remove the samples considered noisy. The
# ``NeighbourhoodCleaningRule`` uses ``EditedNearestNeighbours`` to remove
# some samples. Additionally, it uses a 3 nearest-neighbors rule to remove
# samples which do not agree with this rule.

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2,
                                                         figsize=(15, 25))
X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(ax_arr, (
        CondensedNearestNeighbour(random_state=0),
        OneSidedSelection(random_state=0),
        NeighbourhoodCleaningRule())):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}'.format(
        sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# ``InstanceHardnessThreshold`` uses the predictions of a classifier to
# exclude samples. All samples classified with a low probability will be
# removed.
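###############################################################################
# A minimal sketch, reusing the toy data from the previous block: the
# ``estimator`` parameter controls which classifier provides the
# probabilities used to filter out the hardest samples.

iht = InstanceHardnessThreshold(
    random_state=0,
    estimator=LogisticRegression(solver='lbfgs', multi_class='auto'))
X_iht, y_iht = iht.fit_resample(X, y)
print('InstanceHardnessThreshold resampled distribution: {}'.format(
    sorted(Counter(y_iht).items())))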
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
sampler = InstanceHardnessThreshold(
    random_state=0,
    estimator=LogisticRegression(solver='lbfgs', multi_class='auto'))
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
plot_resampling(X, y, sampler, ax3)
ax3.set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

plt.show()

imbalanced-learn-0.7.0/examples/under-sampling/plot_illustration_nearmiss.py
"""
==========================================================================
Illustration of the sample selection for the different NearMiss algorithms
==========================================================================

This example illustrates the different ways of selecting examples in
NearMiss.

"""

import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestNeighbors

print(__doc__)

rng = np.random.RandomState(18)

###############################################################################
# This function allows making nicer plots


def make_plot_despine(ax):
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlim([0., 3.5])
    ax.set_ylim([0., 3.5])
    ax.set_xlabel(r'$X_1$')
    ax.set_ylabel(r'$X_2$')
    ax.legend()


###############################################################################
# We can start by generating some data to later illustrate the principle of
# each NearMiss heuristic rule.

# minority class
X_minority = np.transpose([[1.1, 1.3, 1.15, 0.8, 0.8, 0.6, 0.55],
                           [1., 1.5, 1.7, 2.5, 2.0, 1.2, 0.55]])
# majority class
X_majority = np.transpose([[2.1, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45],
                           [1.5, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9]])

###############################################################################
# NearMiss-1
###############################################################################

###############################################################################
# NearMiss-1 selects samples from the majority class for which the average
# distance to some nearest neighbours is the smallest. In the following
# example, we use a 3-NN to compute the average distance on 2 specific
# samples of the majority class. Therefore, in this case the point linked by
# the green-dashed line will be selected since the average distance is
# smaller.
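###############################################################################
# Before the hand-made illustration below, a small sketch assuming the toy
# arrays defined above: running the actual ``NearMiss`` sampler with
# ``version=1`` shows which majority samples survive when the majority class
# is reduced to the size of the minority class.

from imblearn.under_sampling import NearMiss

X_toy = np.vstack((X_minority, X_majority))
y_toy = np.array([0] * X_minority.shape[0] + [1] * X_majority.shape[0])
X_nm1, y_nm1 = NearMiss(version=1).fit_resample(X_toy, y_toy)
print('Majority samples kept by NearMiss-1:\n{}'.format(X_nm1[y_nm1 == 1]))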
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.scatter(X_minority[:, 0], X_minority[:, 1],
           label='Minority class', s=200, marker='_')
ax.scatter(X_majority[:, 0], X_majority[:, 1],
           label='Majority class', s=200, marker='+')

nearest_neighbors = NearestNeighbors(n_neighbors=3)
nearest_neighbors.fit(X_minority)
dist, ind = nearest_neighbors.kneighbors(X_majority[:2, :])
dist_avg = dist.sum(axis=1) / 3

for positive_idx, (neighbors, distance, color) in enumerate(
        zip(ind, dist_avg, ['g', 'r'])):
    for make_plot, sample_idx in enumerate(neighbors):
        ax.plot([X_majority[positive_idx, 0], X_minority[sample_idx, 0]],
                [X_majority[positive_idx, 1], X_minority[sample_idx, 1]],
                '--' + color, alpha=0.3,
                label='Avg. dist.={:.2f}'.format(distance)
                if make_plot == 0 else "")
ax.set_title('NearMiss-1')
make_plot_despine(ax)

###############################################################################
# NearMiss-2
###############################################################################

###############################################################################
# NearMiss-2 selects samples from the majority class for which the average
# distance to the farthest neighbors is the smallest. With the same
# configuration as previously presented, the sample linked to the
# green-dashed line will be selected since its average distance to the 3
# farthest neighbors is the smallest.

fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.scatter(X_minority[:, 0], X_minority[:, 1],
           label='Minority class', s=200, marker='_')
ax.scatter(X_majority[:, 0], X_majority[:, 1],
           label='Majority class', s=200, marker='+')

nearest_neighbors = NearestNeighbors(n_neighbors=X_minority.shape[0])
nearest_neighbors.fit(X_minority)
dist, ind = nearest_neighbors.kneighbors(X_majority[:2, :])
dist = dist[:, -3::]
ind = ind[:, -3::]
dist_avg = dist.sum(axis=1) / 3

for positive_idx, (neighbors, distance, color) in enumerate(
        zip(ind, dist_avg, ['g', 'r'])):
    for make_plot, sample_idx in enumerate(neighbors):
        ax.plot([X_majority[positive_idx, 0], X_minority[sample_idx, 0]],
                [X_majority[positive_idx, 1], X_minority[sample_idx, 1]],
                '--' + color, alpha=0.3,
                label='Avg. dist.={:.2f}'.format(distance)
                if make_plot == 0 else "")
ax.set_title('NearMiss-2')
make_plot_despine(ax)

###############################################################################
# NearMiss-3
###############################################################################

###############################################################################
# NearMiss-3 can be divided into 2 steps. First, a nearest-neighbors search
# is used to short-list samples from the majority class (i.e., corresponding
# to the highlighted samples in the following plot). Then, the samples with
# the largest average distance to the *k* nearest neighbors are selected.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.scatter(X_minority[:, 0], X_minority[:, 1],
           label='Minority class', s=200, marker='_')
ax.scatter(X_majority[:, 0], X_majority[:, 1],
           label='Majority class', s=200, marker='+')

nearest_neighbors = NearestNeighbors(n_neighbors=3)
nearest_neighbors.fit(X_majority)

# select only the majority point of interest
selected_idx = nearest_neighbors.kneighbors(X_minority,
                                            return_distance=False)
X_majority = X_majority[np.unique(selected_idx), :]
ax.scatter(X_majority[:, 0], X_majority[:, 1],
           label='Short-listed samples', s=200, alpha=0.3, color='g')
nearest_neighbors = NearestNeighbors(n_neighbors=3)
nearest_neighbors.fit(X_minority)
dist, ind = nearest_neighbors.kneighbors(X_majority[:2, :])
dist_avg = dist.sum(axis=1) / 3

for positive_idx, (neighbors, distance, color) in enumerate(
        zip(ind, dist_avg, ['r', 'g'])):
    for make_plot, sample_idx in enumerate(neighbors):
        ax.plot([X_majority[positive_idx, 0], X_minority[sample_idx, 0]],
                [X_majority[positive_idx, 1], X_minority[sample_idx, 1]],
                '--' + color, alpha=0.3,
                label='Avg. dist.={:.2f}'.format(distance)
                if make_plot == 0 else "")
ax.set_title('NearMiss-3')
make_plot_despine(ax)
fig.tight_layout()

plt.show()

imbalanced-learn-0.7.0/examples/under-sampling/plot_illustration_tomek_links.py
"""
==============================================
Illustration of the definition of a Tomek link
==============================================

This example illustrates what a Tomek link is.

"""

import matplotlib.pyplot as plt
import numpy as np

from imblearn.under_sampling import TomekLinks

print(__doc__)

rng = np.random.RandomState(18)

###############################################################################
# This function allows making nicer plots


def make_plot_despine(ax):
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlim([0., 3.5])
    ax.set_ylim([0., 3.5])
    ax.set_xlabel(r'$X_1$')
    ax.set_ylabel(r'$X_2$')
    ax.legend()


###############################################################################
# Generate some data with one Tomek link

# minority class
X_minority = np.transpose([[1.1, 1.3, 1.15, 0.8, 0.55, 2.1],
                           [1., 1.5, 1.7, 2.5, 0.55, 1.9]])
# majority class
X_majority = np.transpose([[2.1, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45],
                           [1.5, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9]])

###############################################################################
# In the figure below, the samples highlighted in green form a Tomek link
# since they are of different classes and are nearest neighbours of each
# other.

fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.scatter(X_minority[:, 0], X_minority[:, 1],
           label='Minority class', s=200, marker='_')
ax.scatter(X_majority[:, 0], X_majority[:, 1],
           label='Majority class', s=200, marker='+')

# highlight the samples of interest
ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
           [X_minority[-1, 1], X_majority[1, 1]],
           label='Tomek link', s=200, alpha=0.3)
ax.set_title('Illustration of a Tomek link')
make_plot_despine(ax)
fig.tight_layout()

###############################################################################
# We can run the ``TomekLinks`` sampling to remove the corresponding samples.
# If ``sampling_strategy='auto'``, only the sample from the majority class
# will be removed. If ``sampling_strategy='all'``, both samples will be
# removed.
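###############################################################################
# Before running the sampler, a hand-made check (a sketch assuming the toy
# arrays above): a Tomek link can be detected by verifying that the two
# highlighted samples of different classes are mutual 1-nearest neighbours.

from sklearn.neighbors import NearestNeighbors

X_all = np.vstack((X_minority, X_majority))
nn = NearestNeighbors(n_neighbors=2).fit(X_all)
# the first neighbour of each sample is itself, so take the second column
neighbour = nn.kneighbors(X_all, return_distance=False)[:, 1]
i = X_minority.shape[0] - 1  # index of the highlighted minority sample
j = X_minority.shape[0] + 1  # index of the highlighted majority sample
print('Mutual nearest neighbours: {}'.format(
    neighbour[i] == j and neighbour[j] == i))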
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

ax_arr = (ax1, ax2)
title_arr = ('Removing only majority samples',
             'Removing all samples')
for ax, title, sampler in zip(ax_arr,
                              title_arr,
                              [TomekLinks(sampling_strategy='auto'),
                               TomekLinks(sampling_strategy='all')]):
    X_res, y_res = sampler.fit_resample(np.vstack((X_minority, X_majority)),
                                        np.array([0] * X_minority.shape[0] +
                                                 [1] * X_majority.shape[0]))
    ax.scatter(X_res[y_res == 0][:, 0], X_res[y_res == 0][:, 1],
               label='Minority class', s=200, marker='_')
    ax.scatter(X_res[y_res == 1][:, 0], X_res[y_res == 1][:, 1],
               label='Majority class', s=200, marker='+')

    # highlight the samples of interest
    ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
               [X_minority[-1, 1], X_majority[1, 1]],
               label='Tomek link', s=200, alpha=0.3)

    ax.set_title(title)
    make_plot_despine(ax)
fig.tight_layout()

plt.show()

imbalanced-learn-0.7.0/imblearn/
imbalanced-learn-0.7.0/imblearn/__init__.py
"""Toolbox for imbalanced datasets in machine learning.

``imbalanced-learn`` is a set of python methods to deal with imbalanced
datasets in machine learning and pattern recognition.

Subpackages
-----------
combine
    Module which provides methods based on over-sampling and under-sampling.
ensemble
    Module which provides methods generating an ensemble of
    under-sampled subsets.
exceptions
    Module including custom warnings and error classes used across
    imbalanced-learn.
keras
    Module which provides custom generator, layers for deep learning using
    keras.
metrics
    Module which provides metrics to quantify the classification performance
    with imbalanced dataset.
over_sampling
    Module which provides methods to over-sample a dataset.
tensorflow
    Module which provides custom generator, layers for deep learning using
    tensorflow.
under_sampling
    Module which provides methods to under-sample a dataset.
utils
    Module including various utilities.
pipeline
    Module which allows creating a pipeline with scikit-learn estimators.
"""
import importlib
import types

from . import combine
from . import ensemble
from . import exceptions
from . import metrics
from . import over_sampling
from . import tensorflow
from . import under_sampling
from . import utils
from . import pipeline

from .base import FunctionSampler
from ._version import __version__
from .utils._show_versions import show_versions

# # FIXME: When we get Python 3.7 as minimal version, we will need to switch
# # to the following solution:
# # https://snarky.ca/lazy-importing-in-python-3-7/


class LazyLoader(types.ModuleType):
    """Lazily import a module, mainly to avoid pulling in large dependencies.
    Adapted from TensorFlow:
    https://github.com/tensorflow/tensorflow/blob/master/tensorflow/
    python/util/lazy_loader.py
    """

    def __init__(self, local_name, parent_module_globals, name, warning=None):
        self._local_name = local_name
        self._parent_module_globals = parent_module_globals
        self._warning = warning

        super(LazyLoader, self).__init__(name)

    def _load(self):
        """Load the module and insert it into the parent's globals."""
        # Import the target module and insert it into the parent's namespace
        module = importlib.import_module(self.__name__)
        self._parent_module_globals[self._local_name] = module

        # Update this object's dict so that if someone keeps a reference to
        # the LazyLoader, lookups are efficient (__getattr__ is only called
        # on lookups that fail).
        self.__dict__.update(module.__dict__)

        return module

    def __getattr__(self, item):
        module = self._load()
        return getattr(module, item)

    def __dir__(self):
        module = self._load()
        return dir(module)


# delay the import of keras since we are going to import either tensorflow
# or keras
keras = LazyLoader("keras", globals(), "imblearn.keras")

__all__ = [
    "combine",
    "ensemble",
    "exceptions",
    "keras",
    "metrics",
    "over_sampling",
    "tensorflow",
    "under_sampling",
    "utils",
    "pipeline",
    "FunctionSampler",
    "__version__",
]

imbalanced-learn-0.7.0/imblearn/_version.py
"""
``imbalanced-learn`` is a set of python methods to deal with imbalanced
datasets in machine learning and pattern recognition.
"""
# Based on NiLearn package
# License: simplified BSD

# PEP0440 compatible formatted version, see:
# https://www.python.org/dev/peps/pep-0440/
#
# Generic release markers:
# X.Y
# X.Y.Z # For bugfix releases
#
# Admissible pre-release markers:
# X.YaN # Alpha release
# X.YbN # Beta release
# X.YrcN # Release Candidate
# X.Y # Final release
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
__version__ = "0.7.0"

imbalanced-learn-0.7.0/imblearn/base.py
"""Base class for sampling"""

# Authors: Guillaume Lemaitre
# Christos Aridas
# License: MIT

from abc import ABCMeta, abstractmethod

import numpy as np

from sklearn.base import BaseEstimator
from sklearn.preprocessing import label_binarize
from sklearn.utils.multiclass import check_classification_targets

from .utils import check_sampling_strategy, check_target_type
from .utils._validation import ArraysTransformer
from .utils._validation import _deprecate_positional_args


class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
    """Mixin class for samplers with abstract method.

    Warning: This class should not be used directly. Use the derived classes
    instead.
    """

    _estimator_type = "sampler"

    def fit(self, X, y):
        """Check inputs and statistics of the sampler.

        You should use ``fit_resample`` in all cases.

        Parameters
        ----------
        X : {array-like, dataframe, sparse matrix} of shape \
                (n_samples, n_features)
            Data array.

        y : array-like of shape (n_samples,)
            Target array.

        Returns
        -------
        self : object
            Return the instance itself.
        """
        X, y, _ = self._check_X_y(X, y)
        self.sampling_strategy_ = check_sampling_strategy(
            self.sampling_strategy, y, self._sampling_type
        )
        return self

    def fit_resample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, dataframe, sparse matrix} of shape \
                (n_samples, n_features)
            Matrix containing the data which have to be sampled.
        y : array-like of shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {array-like, dataframe, sparse matrix} of shape \
                (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : array-like of shape (n_samples_new,)
            The corresponding label of `X_resampled`.
        """
        check_classification_targets(y)
        arrays_transformer = ArraysTransformer(X, y)
        X, y, binarize_y = self._check_X_y(X, y)

        self.sampling_strategy_ = check_sampling_strategy(
            self.sampling_strategy, y, self._sampling_type
        )

        output = self._fit_resample(X, y)

        y_ = (label_binarize(output[1], np.unique(y))
              if binarize_y else output[1])

        X_, y_ = arrays_transformer.transform(output[0], y_)
        return (X_, y_) if len(output) == 2 else (X_, y_, output[2])

    # define an alias for back-compatibility
    fit_sample = fit_resample

    @abstractmethod
    def _fit_resample(self, X, y):
        """Base method defined in each sampler to define the sampling
        strategy.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like of shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix} of shape \
                (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray of shape (n_samples_new,)
            The corresponding label of `X_resampled`.
        """
        pass


class BaseSampler(SamplerMixin):
    """Base class for sampling algorithms.

    Warning: This class should not be used directly. Use the derived classes
    instead.
    """

    def __init__(self, sampling_strategy="auto"):
        self.sampling_strategy = sampling_strategy

    def _check_X_y(self, X, y, accept_sparse=None):
        if accept_sparse is None:
            accept_sparse = ["csr", "csc"]
        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
        X, y = self._validate_data(
            X, y, reset=True, accept_sparse=accept_sparse
        )
        return X, y, binarize_y


def _identity(X, y):
    return X, y


class FunctionSampler(BaseSampler):
    """Construct a sampler from calling an arbitrary callable.

    Read more in the :ref:`User Guide `.

    Parameters
    ----------
    func : callable, default=None
        The callable to use for the transformation. This will be passed the
        same arguments as transform, with args and kwargs forwarded. If func
        is None, then func will be the identity function.

    accept_sparse : bool, default=True
        Whether sparse inputs are supported. By default, sparse inputs are
        supported.

    kw_args : dict, default=None
        The keyword argument expected by ``func``.

    validate : bool, default=True
        Whether to validate ``X`` and ``y``. Turning validation off allows
        using the ``FunctionSampler`` with any type of data.

    See Also
    --------
    sklearn.preprocessing.FunctionTransformer : Stateless transformer.

    Notes
    -----
    See
    :ref:`sphx_glr_auto_examples_plot_outlier_rejections.py`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.datasets import make_classification
    >>> from imblearn import FunctionSampler
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ...     weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ...     n_features=20, n_clusters_per_class=1, n_samples=1000,
    ...     random_state=10)

    We can create a sampler which selects only the first ten samples, for
    instance.

    >>> def func(X, y):
    ...   return X[:10], y[:10]
    >>> sampler = FunctionSampler(func=func)
    >>> X_res, y_res = sampler.fit_resample(X, y)
    >>> np.all(X_res == X[:10])
    True
    >>> np.all(y_res == y[:10])
    True

    We can also create a specific function which takes some arguments.
    >>> from collections import Counter
    >>> from imblearn.under_sampling import RandomUnderSampler
    >>> def func(X, y, sampling_strategy, random_state):
    ...   return RandomUnderSampler(
    ...       sampling_strategy=sampling_strategy,
    ...       random_state=random_state).fit_resample(X, y)
    >>> sampler = FunctionSampler(func=func,
    ...                           kw_args={'sampling_strategy': 'auto',
    ...                                    'random_state': 0})
    >>> X_res, y_res = sampler.fit_resample(X, y)
    >>> print('Resampled dataset shape {}'.format(
    ...     sorted(Counter(y_res).items())))
    Resampled dataset shape [(0, 100), (1, 100)]
    """

    _sampling_type = "bypass"

    @_deprecate_positional_args
    def __init__(self, *, func=None, accept_sparse=True, kw_args=None,
                 validate=True):
        super().__init__()
        self.func = func
        self.accept_sparse = accept_sparse
        self.kw_args = kw_args
        self.validate = validate

    def fit_resample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like of shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {array-like, sparse matrix} of shape \
                (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : array-like of shape (n_samples_new,)
            The corresponding label of `X_resampled`.
        """
        arrays_transformer = ArraysTransformer(X, y)

        if self.validate:
            check_classification_targets(y)
            X, y, binarize_y = self._check_X_y(
                X, y, accept_sparse=self.accept_sparse
            )

        self.sampling_strategy_ = check_sampling_strategy(
            self.sampling_strategy, y, self._sampling_type
        )

        output = self._fit_resample(X, y)

        if self.validate:
            y_ = (label_binarize(output[1], np.unique(y))
                  if binarize_y else output[1])
            X_, y_ = arrays_transformer.transform(output[0], y_)
            return (X_, y_) if len(output) == 2 else (X_, y_, output[2])

        return output

    def _fit_resample(self, X, y):
        func = _identity if self.func is None else self.func
        output = func(X, y, **(self.kw_args if self.kw_args else {}))
        return output

imbalanced-learn-0.7.0/imblearn/combine/
imbalanced-learn-0.7.0/imblearn/combine/__init__.py
"""The :mod:`imblearn.combine` provides methods which combine
over-sampling and under-sampling.
"""

from ._smote_enn import SMOTEENN
from ._smote_tomek import SMOTETomek

__all__ = ["SMOTEENN", "SMOTETomek"]

imbalanced-learn-0.7.0/imblearn/combine/_smote_enn.py
"""Class to perform over-sampling using SMOTE and cleaning using ENN."""

# Authors: Guillaume Lemaitre
# Christos Aridas
# License: MIT

from sklearn.base import clone
from sklearn.utils import check_X_y

from ..base import BaseSampler
from ..over_sampling import SMOTE
from ..over_sampling.base import BaseOverSampler
from ..under_sampling import EditedNearestNeighbours
from ..utils import check_target_type
from ..utils import Substitution
from ..utils._docstring import _n_jobs_docstring
from ..utils._docstring import _random_state_docstring
from ..utils._validation import _deprecate_positional_args


@Substitution(
    sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
    random_state=_random_state_docstring,
)
class SMOTEENN(BaseSampler):
    """Over-sampling using SMOTE and cleaning using ENN.

    Combine over- and under-sampling using SMOTE and Edited Nearest
    Neighbours.

    Read more in the :ref:`User Guide `.
    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    smote : object, default=None
        The :class:`imblearn.over_sampling.SMOTE` object to use. If not
        given, a :class:`imblearn.over_sampling.SMOTE` object with default
        parameters will be used.

    enn : object, default=None
        The :class:`imblearn.under_sampling.EditedNearestNeighbours` object
        to use. If not given, a
        :class:`imblearn.under_sampling.EditedNearestNeighbours` object with
        sampling strategy='all' will be used.

    {n_jobs}

    See Also
    --------
    SMOTETomek : Over-sample using SMOTE followed by under-sampling removing
        the Tomek's links.

    Notes
    -----
    The method is presented in [1]_.

    Supports multi-class resampling. Refer to SMOTE and ENN regarding the
    scheme which is used.

    References
    ----------
    .. [1] G. Batista, R. C. Prati, M. C. Monard. "A study of the behavior of
       several methods for balancing machine learning training data," ACM
       Sigkdd Explorations Newsletter 6 (1), 20-29, 2004.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.combine import SMOTEENN # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000,
    ... random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> sme = SMOTEENN(random_state=42)
    >>> X_res, y_res = sme.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 900, 1: 881}})
    """

    _sampling_type = "over-sampling"

    @_deprecate_positional_args
    def __init__(
        self,
        *,
        sampling_strategy="auto",
        random_state=None,
        smote=None,
        enn=None,
        n_jobs=None,
    ):
        super().__init__()
        self.sampling_strategy = sampling_strategy
        self.random_state = random_state
        self.smote = smote
        self.enn = enn
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        "Private function to validate SMOTE and ENN objects"
        if self.smote is not None:
            if isinstance(self.smote, SMOTE):
                self.smote_ = clone(self.smote)
            else:
                raise ValueError(
                    "smote needs to be a SMOTE object. "
                    "Got {} instead.".format(type(self.smote))
                )
        # Otherwise create a default SMOTE
        else:
            self.smote_ = SMOTE(
                sampling_strategy=self.sampling_strategy,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
            )

        if self.enn is not None:
            if isinstance(self.enn, EditedNearestNeighbours):
                self.enn_ = clone(self.enn)
            else:
                raise ValueError(
                    "enn needs to be an EditedNearestNeighbours."
" Got {} instead.".format(type(self.enn)) ) # Otherwise create a default EditedNearestNeighbours else: self.enn_ = EditedNearestNeighbours( sampling_strategy="all", n_jobs=self.n_jobs ) def _fit_resample(self, X, y): self._validate_estimator() y = check_target_type(y) X, y = check_X_y(X, y, accept_sparse=["csr", "csc"]) self.sampling_strategy_ = self.sampling_strategy X_res, y_res = self.smote_.fit_resample(X, y) return self.enn_.fit_resample(X_res, y_res) imbalanced-learn-0.7.0/imblearn/combine/_smote_tomek.py000066400000000000000000000111441366766276300231670ustar00rootroot00000000000000"""Class to perform over-sampling using SMOTE and cleaning using Tomek links.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from sklearn.base import clone from sklearn.utils import check_X_y from ..base import BaseSampler from ..over_sampling import SMOTE from ..over_sampling.base import BaseOverSampler from ..under_sampling import TomekLinks from ..utils import check_target_type from ..utils import Substitution from ..utils._docstring import _n_jobs_docstring from ..utils._docstring import _random_state_docstring from ..utils._validation import _deprecate_positional_args @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class SMOTETomek(BaseSampler): """Over-sampling using SMOTE and cleaning using Tomek links. Combine over- and under-sampling using SMOTE and Tomek links. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} smote : object, default=None The :class:`imblearn.over_sampling.SMOTE` object to use. If not given, a :class:`imblearn.over_sampling.SMOTE` object with default parameters will be given. tomek : object, default=None The :class:`imblearn.under_sampling.TomekLinks` object to use. If not given, a :class:`imblearn.under_sampling.TomekLinks` object with sampling strategy='all' will be given. {n_jobs} See Also -------- SMOTEENN : Over-sample using SMOTE followed by under-sampling using Edited Nearest Neighbours. Notes ----- The methos is presented in [1]_. Supports multi-class resampling. Refer to SMOTE and TomekLinks regarding the scheme which used. References ---------- .. [1] G. Batista, B. Bazzan, M. Monard, "Balancing Training Data for Automated Annotation of Keywords: a Case Study," In WOB, 10-18, 2003. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.combine import \ SMOTETomek # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
    ... n_features=20, n_clusters_per_class=1, n_samples=1000,
    ... random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> smt = SMOTETomek(random_state=42)
    >>> X_res, y_res = smt.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 900, 1: 900}})
    """

    _sampling_type = "over-sampling"

    @_deprecate_positional_args
    def __init__(
        self,
        *,
        sampling_strategy="auto",
        random_state=None,
        smote=None,
        tomek=None,
        n_jobs=None,
    ):
        super().__init__()
        self.sampling_strategy = sampling_strategy
        self.random_state = random_state
        self.smote = smote
        self.tomek = tomek
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        "Private function to validate SMOTE and TomekLinks objects"
        if self.smote is not None:
            if isinstance(self.smote, SMOTE):
                self.smote_ = clone(self.smote)
            else:
                raise ValueError(
                    "smote needs to be a SMOTE object. "
                    "Got {} instead.".format(type(self.smote))
                )
        # Otherwise create a default SMOTE
        else:
            self.smote_ = SMOTE(
                sampling_strategy=self.sampling_strategy,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
            )

        if self.tomek is not None:
            if isinstance(self.tomek, TomekLinks):
                self.tomek_ = clone(self.tomek)
            else:
                raise ValueError(
                    "tomek needs to be a TomekLinks object. "
                    "Got {} instead.".format(type(self.tomek))
                )
        # Otherwise create a default TomekLinks
        else:
            self.tomek_ = TomekLinks(
                sampling_strategy="all", n_jobs=self.n_jobs
            )

    def _fit_resample(self, X, y):
        self._validate_estimator()
        y = check_target_type(y)
        X, y = check_X_y(X, y, accept_sparse=["csr", "csc"])
        self.sampling_strategy_ = self.sampling_strategy

        X_res, y_res = self.smote_.fit_resample(X, y)
        return self.tomek_.fit_resample(X_res, y_res)

imbalanced-learn-0.7.0/imblearn/combine/tests/
imbalanced-learn-0.7.0/imblearn/combine/tests/__init__.py
imbalanced-learn-0.7.0/imblearn/combine/tests/test_smote_enn.py
"""Test the module SMOTE ENN."""

# Authors: Guillaume Lemaitre
# Christos Aridas
# License: MIT

import pytest
import numpy as np

from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_array_equal

from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import SMOTE

RND_SEED = 0
X = np.array(
    [
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
    ]
)
Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
R_TOL = 1e-4


def test_sample_regular():
    smote = SMOTEENN(random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_resample(X, Y)

    X_gt = np.array(
        [
            [1.52091956, -0.49283504],
            [0.84976473, -0.15570176],
            [0.61319159, -0.11571667],
            [0.66052536, -0.28246518],
            [-0.28162401, -2.10400981],
            [0.83680821, 1.72827342],
            [0.08711622, 0.93259929],
        ]
    )
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)


def test_sample_regular_pass_smote_enn():
    smote = SMOTEENN(
        smote=SMOTE(sampling_strategy="auto", random_state=RND_SEED),
        enn=EditedNearestNeighbours(sampling_strategy="all"),
        random_state=RND_SEED,
    )
    X_resampled, y_resampled = smote.fit_resample(X, Y)

    X_gt = np.array(
        [
            [1.52091956, -0.49283504],
            [0.84976473, -0.15570176],
            [0.61319159, -0.11571667],
            [0.66052536, -0.28246518],
            [-0.28162401, -2.10400981],
            [0.83680821, 1.72827342],
            [0.08711622, 0.93259929],
        ]
    )
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)


def test_sample_regular_half():
    sampling_strategy = {0: 10, 1: 12}
    smote = SMOTEENN(
        sampling_strategy=sampling_strategy, random_state=RND_SEED
    )
    X_resampled, y_resampled = smote.fit_resample(X, Y)

    X_gt = np.array(
        [
            [1.52091956, -0.49283504],
            [-0.28162401, -2.10400981],
            [0.83680821, 1.72827342],
            [0.08711622, 0.93259929],
        ]
    )
    y_gt = np.array([0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)


def test_validate_estimator_init():
    smote = SMOTE(random_state=RND_SEED)
    enn = EditedNearestNeighbours(sampling_strategy="all")
    smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array(
        [
            [1.52091956, -0.49283504],
            [0.84976473, -0.15570176],
            [0.61319159, -0.11571667],
            [0.66052536, -0.28246518],
            [-0.28162401, -2.10400981],
            [0.83680821, 1.72827342],
            [0.08711622, 0.93259929],
        ]
    )
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)


def test_validate_estimator_default():
    smt = SMOTEENN(random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array(
        [
            [1.52091956, -0.49283504],
            [0.84976473, -0.15570176],
            [0.61319159, -0.11571667],
            [0.66052536, -0.28246518],
            [-0.28162401, -2.10400981],
            [0.83680821, 1.72827342],
            [0.08711622, 0.93259929],
        ]
    )
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)


def test_parallelisation():
    # Check if default job count is None
    smt = SMOTEENN(random_state=RND_SEED)
    smt._validate_estimator()
    assert smt.n_jobs is None
    assert smt.smote_.n_jobs is None
    assert smt.enn_.n_jobs is None

    # Check if job count is set
    smt = SMOTEENN(random_state=RND_SEED, n_jobs=8)
    smt._validate_estimator()
    assert smt.n_jobs == 8
    assert smt.smote_.n_jobs == 8
    assert smt.enn_.n_jobs == 8


@pytest.mark.parametrize(
    "smote_params, err_msg",
    [
        ({"smote": "rnd"}, "smote needs to be a SMOTE"),
        ({"enn": "rnd"}, "enn needs to be an "),
    ],
)
def test_error_wrong_object(smote_params, err_msg):
    smt = SMOTEENN(**smote_params)
    with pytest.raises(ValueError, match=err_msg):
        smt.fit_resample(X, Y)

imbalanced-learn-0.7.0/imblearn/combine/tests/test_smote_tomek.py
"""Test the module SMOTE Tomek."""

# Authors: Guillaume Lemaitre
# Christos Aridas
# License: MIT

import pytest
import numpy as np

from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_array_equal

from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

RND_SEED = 0
X = np.array(
    [
        [0.20622591, 0.0582794],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.2184254, 0.24299982],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [1.06514042, -0.0770537],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.27410027, -0.54194484],
        [0.8381014, 0.44085498],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
    ]
)
Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
R_TOL = 1e-4


def test_sample_regular():
    smote = SMOTETomek(random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_resample(X, Y)
    X_gt = np.array(
        [
            [0.68481731, 0.51935141],
            [1.34192108, -0.13367336],
            [0.62366841, -0.21312976],
            [1.61091956, -0.40283504],
            [-0.37162401, -2.19400981],
            [0.74680821, 1.63827342],
            [0.61472253, -0.82309052],
            [0.19893132, -0.47761769],
            [1.40301027, -0.83648734],
            [-1.20515198, -1.02689695],
            [-0.23374509, 0.18370049],
            [-0.00288378, 0.84259929],
            [1.79580611, -0.02219234],
            [0.38307743, -0.05670439],
            [0.70319159, -0.02571667],
            [0.75052536, -0.19246518],
        ]
    )
    y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)


def test_sample_regular_half():
    sampling_strategy = {0: 9, 1: 12}
    smote = SMOTETomek(
        sampling_strategy=sampling_strategy, random_state=RND_SEED
    )
    X_resampled, y_resampled = smote.fit_resample(X, Y)
    X_gt = np.array(
        [
            [0.68481731, 0.51935141],
            [0.62366841, -0.21312976],
            [1.61091956, -0.40283504],
            [-0.37162401, -2.19400981],
            [0.74680821, 1.63827342],
            [0.61472253, -0.82309052],
            [0.19893132, -0.47761769],
            [1.40301027, -0.83648734],
            [-1.20515198, -1.02689695],
            [-0.23374509, 0.18370049],
            [-0.00288378, 0.84259929],
            [1.79580611, -0.02219234],
            [0.45784496, -0.1053161],
        ]
    )
    y_gt = np.array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)


def test_validate_estimator_init():
    smote = SMOTE(random_state=RND_SEED)
    tomek = TomekLinks(sampling_strategy="all")
    smt = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array(
        [
            [0.68481731, 0.51935141],
            [1.34192108, -0.13367336],
            [0.62366841, -0.21312976],
            [1.61091956, -0.40283504],
            [-0.37162401, -2.19400981],
            [0.74680821, 1.63827342],
            [0.61472253, -0.82309052],
            [0.19893132, -0.47761769],
            [1.40301027, -0.83648734],
            [-1.20515198, -1.02689695],
            [-0.23374509, 0.18370049],
            [-0.00288378, 0.84259929],
            [1.79580611, -0.02219234],
            [0.38307743, -0.05670439],
            [0.70319159, -0.02571667],
            [0.75052536, -0.19246518],
        ]
    )
    y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)


def test_validate_estimator_default():
    smt = SMOTETomek(random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array(
        [
            [0.68481731, 0.51935141],
            [1.34192108, -0.13367336],
            [0.62366841, -0.21312976],
            [1.61091956, -0.40283504],
            [-0.37162401, -2.19400981],
            [0.74680821, 1.63827342],
            [0.61472253, -0.82309052],
            [0.19893132, -0.47761769],
            [1.40301027, -0.83648734],
            [-1.20515198, -1.02689695],
            [-0.23374509, 0.18370049],
            [-0.00288378, 0.84259929],
            [1.79580611, -0.02219234],
            [0.38307743, -0.05670439],
            [0.70319159, -0.02571667],
            [0.75052536, -0.19246518],
        ]
    )
    y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)


def test_parallelisation():
    # Check if default job count is None
    smt = SMOTETomek(random_state=RND_SEED)
    smt._validate_estimator()
    assert smt.n_jobs is None
    assert smt.smote_.n_jobs is None
    assert smt.tomek_.n_jobs is None

    # Check if job count is set
    smt = SMOTETomek(random_state=RND_SEED, n_jobs=8)
    smt._validate_estimator()
    assert smt.n_jobs == 8
    assert smt.smote_.n_jobs == 8
    assert smt.tomek_.n_jobs == 8


@pytest.mark.parametrize(
    "smote_params, err_msg",
    [
        ({"smote": "rnd"}, "smote needs to be a SMOTE"),
        ({"tomek": "rnd"}, "tomek needs to be a TomekLinks"),
    ],
)
def test_error_wrong_object(smote_params, err_msg):
    smt = SMOTETomek(**smote_params)
    with pytest.raises(ValueError, match=err_msg):
        smt.fit_resample(X, Y)

imbalanced-learn-0.7.0/imblearn/datasets/
imbalanced-learn-0.7.0/imblearn/datasets/__init__.py
"""
The :mod:`imblearn.datasets` provides methods to generate
imbalanced data.
"""

from ._imbalance import make_imbalance
from ._zenodo import fetch_datasets

__all__ = ["make_imbalance", "fetch_datasets"]

imbalanced-learn-0.7.0/imblearn/datasets/_imbalance.py
"""Transform a dataset into an imbalanced dataset."""

# Authors: Dayvid Oliveira
# Guillaume Lemaitre
# Christos Aridas
# License: MIT

from collections import Counter

from ..under_sampling import RandomUnderSampler
from ..utils import check_sampling_strategy
from ..utils._validation import _deprecate_positional_args


@_deprecate_positional_args
def make_imbalance(
    X, y, *, sampling_strategy=None, random_state=None, verbose=False,
    **kwargs
):
    """Turns a dataset into an imbalanced dataset with a specific sampling
    strategy.

    Read more in the :ref:`User Guide `.

    Parameters
    ----------
    X : {array-like, dataframe}, shape (n_samples, n_features)
        Matrix containing the data to be imbalanced.

    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    sampling_strategy : dict or callable
        Ratio to use for resampling the data set.

        - When ``dict``, the keys correspond to the targeted classes. The
          values correspond to the desired number of samples for each
          targeted class.

        - When callable, function taking ``y`` and returns a ``dict``. The
          keys correspond to the targeted classes. The values correspond to
          the desired number of samples for each class.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random.

    verbose : bool, optional (default=False)
        Show information regarding the sampling.

    kwargs : dict, optional
        Dictionary of additional keyword arguments to pass to
        ``sampling_strategy``.

    Returns
    -------
    X_resampled : {ndarray, dataframe}, shape (n_samples_new, n_features)
        The array containing the imbalanced data.

    y_resampled : ndarray, shape (n_samples_new)
        The corresponding label of `X_resampled`

    Notes
    -----
    See
    :ref:`sphx_glr_auto_examples_applications_plot_multi_class_under_sampling.py`,
    :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py`, and
    :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`.
    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import load_iris
    >>> from imblearn.datasets import make_imbalance

    >>> data = load_iris()
    >>> X, y = data.data, data.target
    >>> print('Distribution before imbalancing: {}'.format(Counter(y)))
    Distribution before imbalancing: Counter({0: 50, 1: 50, 2: 50})
    >>> X_res, y_res = make_imbalance(X, y,
    ...                               sampling_strategy={0: 10, 1: 20, 2: 30},
    ...                               random_state=42)
    >>> print('Distribution after imbalancing: {}'.format(Counter(y_res)))
    Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})
    """
    target_stats = Counter(y)
    # restrict ratio to be a dict or a callable
    if isinstance(sampling_strategy, dict) or callable(sampling_strategy):
        sampling_strategy_ = check_sampling_strategy(
            sampling_strategy, y, "under-sampling", **kwargs
        )
    else:
        raise ValueError(
            "'sampling_strategy' has to be a dictionary or a "
            "function returning a dictionary. Got {} instead.".format(
                type(sampling_strategy)
            )
        )

    if verbose:
        print("The original target distribution in the dataset is: %s"
              % target_stats)
    rus = RandomUnderSampler(
        sampling_strategy=sampling_strategy_,
        replacement=False,
        random_state=random_state,
    )
    X_resampled, y_resampled = rus.fit_resample(X, y)
    if verbose:
        print("Make the dataset imbalanced: %s" % Counter(y_resampled))

    return X_resampled, y_resampled

imbalanced-learn-0.7.0/imblearn/datasets/_zenodo.py
"""Collection of imbalanced datasets.

This collection of datasets has been proposed in [1]_. The
characteristics of the available datasets are presented in the table
below.

 ID    Name           Repository & Target            Ratio    #S       #F
 1     ecoli          UCI, target: imU               8.6:1    336      7
 2     optical_digits UCI, target: 8                 9.1:1    5,620    64
 3     satimage       UCI, target: 4                 9.3:1    6,435    36
 4     pen_digits     UCI, target: 5                 9.4:1    10,992   16
 5     abalone        UCI, target: 7                 9.7:1    4,177    10
 6     sick_euthyroid UCI, target: sick euthyroid    9.8:1    3,163    42
 7     spectrometer   UCI, target: >=44              11:1     531      93
 8     car_eval_34    UCI, target: good, v good      12:1     1,728    21
 9     isolet         UCI, target: A, B              12:1     7,797    617
 10    us_crime       UCI, target: >0.65             12:1     1,994    100
 11    yeast_ml8      LIBSVM, target: 8              13:1     2,417    103
 12    scene          LIBSVM, target: >one label     13:1     2,407    294
 13    libras_move    UCI, target: 1                 14:1     360      90
 14    thyroid_sick   UCI, target: sick              15:1     3,772    52
 15    coil_2000      KDD, CoIL, target: minority    16:1     9,822    85
 16    arrhythmia     UCI, target: 06                17:1     452      278
 17    solar_flare_m0 UCI, target: M->0              19:1     1,389    32
 18    oil            UCI, target: minority          22:1     937      49
 19    car_eval_4     UCI, target: vgood             26:1     1,728    21
 20    wine_quality   UCI, wine, target: <=4         26:1     4,898    11
 21    letter_img     UCI, target: Z                 26:1     20,000   16
 22    yeast_me2      UCI, target: ME2               28:1     1,484    8
 23    webpage        LIBSVM, w7a, target: minority  33:1     34,780   300
 24    ozone_level    UCI, ozone, data               34:1     2,536    72
 25    mammography    UCI, target: minority          42:1     11,183   6
 26    protein_homo   KDD CUP 2004, minority         111:1    145,751  74
 27    abalone_19     UCI, target: 19                130:1    4,177    10

References
----------
.. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly Imbalanced
   Data Learning and their Application in Bioinformatics." Dissertation,
   Georgia State University, (2011).
""" # Author: Guillaume Lemaitre # License: BSD 3 clause from collections import OrderedDict import tarfile from io import BytesIO from os import makedirs from os.path import join, isfile from urllib.request import urlopen import numpy as np from sklearn.datasets import get_data_home from sklearn.utils import Bunch from sklearn.utils import check_random_state from ..utils._validation import _deprecate_positional_args URL = ( "https://zenodo.org/record/61452/files/" "benchmark-imbalanced-learn.tar.gz" ) PRE_FILENAME = "x" POST_FILENAME = "data.npz" MAP_NAME_ID_KEYS = [ "ecoli", "optical_digits", "satimage", "pen_digits", "abalone", "sick_euthyroid", "spectrometer", "car_eval_34", "isolet", "us_crime", "yeast_ml8", "scene", "libras_move", "thyroid_sick", "coil_2000", "arrhythmia", "solar_flare_m0", "oil", "car_eval_4", "wine_quality", "letter_img", "yeast_me2", "webpage", "ozone_level", "mammography", "protein_homo", "abalone_19", ] MAP_NAME_ID = OrderedDict() MAP_ID_NAME = OrderedDict() for v, k in enumerate(MAP_NAME_ID_KEYS): MAP_NAME_ID[k] = v + 1 MAP_ID_NAME[v + 1] = k @_deprecate_positional_args def fetch_datasets( *, data_home=None, filter_data=None, download_if_missing=True, random_state=None, shuffle=False, verbose=False, ): """Load the benchmark datasets from Zenodo, downloading it if necessary. Parameters ---------- data_home : string, optional (default=None) Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. filter_data : tuple of str/int or None, optional (default=None) A tuple containing the ID or the name of the datasets to be returned. Refer to the above table to get the ID and name of the datasets. download_if_missing : boolean, optional (default=True) If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. random_state : int, RandomState instance or None, optional (default=None) Random state for shuffling the dataset. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. shuffle : bool, optional (default=False) Whether to shuffle dataset. verbose : bool, optional (default=False) Show information regarding the fetching. Returns ------- datasets : OrderedDict of Bunch object, The ordered is defined by ``filter_data``. Each Bunch object --- refered as dataset --- have the following attributes: dataset.data : ndarray, shape (n_samples, n_features) dataset.target : ndarray, shape (n_samples, ) dataset.DESCR : string Description of the each dataset. Notes ----- This collection of datasets have been proposed in [1]_. The characteristics of the available datasets are presented in the table below. 
    +--+--------------+-------------------------------+-------+---------+-----+
    |ID|Name          | Repository & Target           | Ratio | #S      | #F  |
    +==+==============+===============================+=======+=========+=====+
    |1 |ecoli         | UCI, target: imU              | 8.6:1 | 336     | 7   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |2 |optical_digits| UCI, target: 8                | 9.1:1 | 5,620   | 64  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |3 |satimage      | UCI, target: 4                | 9.3:1 | 6,435   | 36  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |4 |pen_digits    | UCI, target: 5                | 9.4:1 | 10,992  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |5 |abalone       | UCI, target: 7                | 9.7:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |6 |sick_euthyroid| UCI, target: sick euthyroid   | 9.8:1 | 3,163   | 42  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |7 |spectrometer  | UCI, target: >=44             | 11:1  | 531     | 93  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |8 |car_eval_34   | UCI, target: good, v good     | 12:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |9 |isolet        | UCI, target: A, B             | 12:1  | 7,797   | 617 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |10|us_crime      | UCI, target: >0.65            | 12:1  | 1,994   | 100 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |11|yeast_ml8     | LIBSVM, target: 8             | 13:1  | 2,417   | 103 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |12|scene         | LIBSVM, target: >one label    | 13:1  | 2,407   | 294 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |13|libras_move   | UCI, target: 1                | 14:1  | 360     | 90  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |14|thyroid_sick  | UCI, target: sick             | 15:1  | 3,772   | 52  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |15|coil_2000     | KDD, CoIL, target: minority   | 16:1  | 9,822   | 85  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |16|arrhythmia    | UCI, target: 06               | 17:1  | 452     | 278 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |17|solar_flare_m0| UCI, target: M->0             | 19:1  | 1,389   | 32  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |18|oil           | UCI, target: minority         | 22:1  | 937     | 49  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |19|car_eval_4    | UCI, target: vgood            | 26:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |20|wine_quality  | UCI, wine, target: <=4        | 26:1  | 4,898   | 11  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |21|letter_img    | UCI, target: Z                | 26:1  | 20,000  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |22|yeast_me2     | UCI, target: ME2              | 28:1  | 1,484   | 8   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |23|webpage       | LIBSVM, w7a, target: minority | 33:1  | 34,780  | 300 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |24|ozone_level   | UCI, ozone, data              | 34:1  | 2,536   | 72  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |25|mammography   | UCI, target: minority         | 42:1  | 11,183  | 6   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+

    References
    ----------
    .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
       Imbalanced Data Learning and their Application in Bioinformatics."
       Dissertation, Georgia State University, (2011).
    """
    data_home = get_data_home(data_home=data_home)
    zenodo_dir = join(data_home, "zenodo")
    datasets = OrderedDict()

    if filter_data is None:
        filter_data_ = MAP_NAME_ID.keys()
    else:
        list_data = MAP_NAME_ID.keys()
        filter_data_ = []
        for it in filter_data:
            if isinstance(it, str):
                if it not in list_data:
                    raise ValueError(
                        "{} is not a dataset available. "
                        "The available datasets are {}".format(it, list_data)
                    )
                else:
                    filter_data_.append(it)
            elif isinstance(it, int):
                if it < 1 or it > 27:
                    raise ValueError(
                        "The dataset with the ID={} is not an "
                        "available dataset. The IDs are "
                        "{}".format(it, range(1, 28))
                    )
                else:
                    # The index starts at one, so we need to subtract one
                    # to avoid issues with the indexing.
                    filter_data_.append(MAP_ID_NAME[it])
            else:
                raise ValueError(
                    "The value in the tuple should be str or int."
                    " Got {} instead.".format(type(it))
                )

    # go through the list and check if the data are available
    for it in filter_data_:
        filename = PRE_FILENAME + str(MAP_NAME_ID[it]) + POST_FILENAME
        filename = join(zenodo_dir, filename)
        available = isfile(filename)

        if download_if_missing and not available:
            makedirs(zenodo_dir, exist_ok=True)
            if verbose:
                print("Downloading %s" % URL)
            f = BytesIO(urlopen(URL).read())
            tar = tarfile.open(fileobj=f)
            tar.extractall(path=zenodo_dir)
        elif not download_if_missing and not available:
            raise IOError("Data not found and `download_if_missing` is False")

        data = np.load(filename)
        X, y = data["data"], data["label"]
        if shuffle:
            ind = np.arange(X.shape[0])
            rng = check_random_state(random_state)
            rng.shuffle(ind)
            X = X[ind]
            y = y[ind]
        datasets[it] = Bunch(data=X, target=y, DESCR=it)

    return datasets

imbalanced-learn-0.7.0/imblearn/datasets/tests/
imbalanced-learn-0.7.0/imblearn/datasets/tests/__init__.py
imbalanced-learn-0.7.0/imblearn/datasets/tests/test_imbalance.py
"""Test the make_imbalance function."""

# Authors: Guillaume Lemaitre
# Christos Aridas
# License: MIT

from collections import Counter

import pytest
import numpy as np

from sklearn.datasets import load_iris
from sklearn.datasets import fetch_openml

from imblearn.datasets import make_imbalance


@pytest.fixture
def iris():
    return load_iris(return_X_y=True)


@pytest.mark.parametrize(
    "sampling_strategy, err_msg",
    [
        ({0: -100, 1: 50, 2: 50}, "in a class cannot be negative"),
        ({0: 10, 1: 70}, "should be less or equal to the original"),
        ("random-string", "has to be a dictionary or a function"),
    ],
)
def test_make_imbalance_error(iris, sampling_strategy, err_msg):
    # we are reusing part of utils.check_sampling_strategy, however this is
    # not covered in the common tests so we will repeat it here
    X, y = iris
    with pytest.raises(ValueError, match=err_msg):
        make_imbalance(X, y, sampling_strategy=sampling_strategy)
test_make_imbalance_error_single_class(iris): X, y = iris y = np.zeros_like(y) with pytest.raises(ValueError, match="needs to have more than 1 class."): make_imbalance(X, y, sampling_strategy={0: 10}) @pytest.mark.parametrize( "sampling_strategy, expected_counts", [ ({0: 10, 1: 20, 2: 30}, {0: 10, 1: 20, 2: 30}), ({0: 10, 1: 20}, {0: 10, 1: 20, 2: 50}), ], ) def test_make_imbalance_dict(iris, sampling_strategy, expected_counts): X, y = iris _, y_ = make_imbalance(X, y, sampling_strategy=sampling_strategy) assert Counter(y_) == expected_counts @pytest.mark.parametrize("as_frame", [True, False], ids=['dataframe', 'array']) @pytest.mark.parametrize( "sampling_strategy, expected_counts", [ ({'Iris-setosa': 10, 'Iris-versicolor': 20, 'Iris-virginica': 30}, {'Iris-setosa': 10, 'Iris-versicolor': 20, 'Iris-virginica': 30}), ({'Iris-setosa': 10, 'Iris-versicolor': 20}, {'Iris-setosa': 10, 'Iris-versicolor': 20, 'Iris-virginica': 50}), ], ) def test_make_imbalanced_iris(as_frame, sampling_strategy, expected_counts): pytest.importorskip("pandas") X, y = fetch_openml('iris', version=1, return_X_y=True, as_frame=as_frame) X_res, y_res = make_imbalance(X, y, sampling_strategy=sampling_strategy) if as_frame: assert hasattr(X_res, "loc") assert Counter(y_res) == expected_counts imbalanced-learn-0.7.0/imblearn/datasets/tests/test_zenodo.py000066400000000000000000000053721366766276300244030ustar00rootroot00000000000000"""Test the datasets loader. Skipped if datasets is not already downloaded to data_home. """ # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import pytest from imblearn.datasets import fetch_datasets from sklearn.utils._testing import SkipTest DATASET_SHAPE = { "ecoli": (336, 7), "optical_digits": (5620, 64), "satimage": (6435, 36), "pen_digits": (10992, 16), "abalone": (4177, 10), "sick_euthyroid": (3163, 42), "spectrometer": (531, 93), "car_eval_34": (1728, 21), "isolet": (7797, 617), "us_crime": (1994, 100), "yeast_ml8": (2417, 103), "scene": (2407, 294), "libras_move": (360, 90), "thyroid_sick": (3772, 52), "coil_2000": (9822, 85), "arrhythmia": (452, 278), "solar_flare_m0": (1389, 32), "oil": (937, 49), "car_eval_4": (1728, 21), "wine_quality": (4898, 11), "letter_img": (20000, 16), "yeast_me2": (1484, 8), "webpage": (34780, 300), "ozone_level": (2536, 72), "mammography": (11183, 6), "protein_homo": (145751, 74), "abalone_19": (4177, 10), } def fetch(*args, **kwargs): return fetch_datasets(*args, download_if_missing=True, **kwargs) @pytest.mark.xfail def test_fetch(): try: datasets1 = fetch(shuffle=True, random_state=42) except IOError: raise SkipTest("Zenodo dataset can not be loaded.") datasets2 = fetch(shuffle=True, random_state=37) for k in DATASET_SHAPE.keys(): X1, X2 = datasets1[k].data, datasets2[k].data assert DATASET_SHAPE[k] == X1.shape assert X1.shape == X2.shape y1, y2 = datasets1[k].target, datasets2[k].target assert (X1.shape[0],) == y1.shape assert (X1.shape[0],) == y2.shape def test_fetch_filter(): try: datasets1 = fetch( filter_data=tuple([1]), shuffle=True, random_state=42 ) except IOError: raise SkipTest("Zenodo dataset can not be loaded.") datasets2 = fetch( filter_data=tuple(["ecoli"]), shuffle=True, random_state=37 ) X1, X2 = datasets1["ecoli"].data, datasets2["ecoli"].data assert DATASET_SHAPE["ecoli"] == X1.shape assert X1.shape == X2.shape assert X1.sum() == pytest.approx(X2.sum()) y1, y2 = datasets1["ecoli"].target, datasets2["ecoli"].target assert (X1.shape[0],) == y1.shape assert (X1.shape[0],) == y2.shape @pytest.mark.parametrize( "filter_data, 
err_msg", [ (("rnf",), "is not a dataset available"), ((-1,), "dataset with the ID="), ((100,), "dataset with the ID="), ((1.00,), "value in the tuple"), ], ) def test_fetch_error(filter_data, err_msg): with pytest.raises(ValueError, match=err_msg): fetch_datasets(filter_data=filter_data) imbalanced-learn-0.7.0/imblearn/ensemble/000077500000000000000000000000001366766276300203055ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/ensemble/__init__.py000066400000000000000000000007211366766276300224160ustar00rootroot00000000000000""" The :mod:`imblearn.ensemble` module include methods generating under-sampled subsets combined inside an ensemble. """ from ._easy_ensemble import EasyEnsembleClassifier from ._bagging import BalancedBaggingClassifier from ._forest import BalancedRandomForestClassifier from ._weight_boosting import RUSBoostClassifier __all__ = [ "BalancedBaggingClassifier", "BalancedRandomForestClassifier", "EasyEnsembleClassifier", "RUSBoostClassifier", ] imbalanced-learn-0.7.0/imblearn/ensemble/_bagging.py000066400000000000000000000234661366766276300224270ustar00rootroot00000000000000"""Bagging classifier trained on balanced bootstrap samples.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numbers import numpy as np from sklearn.base import clone from sklearn.ensemble import BaggingClassifier from sklearn.tree import DecisionTreeClassifier from ..pipeline import Pipeline from ..under_sampling import RandomUnderSampler from ..under_sampling.base import BaseUnderSampler from ..utils import Substitution, check_target_type, check_sampling_strategy from ..utils._docstring import _n_jobs_docstring from ..utils._docstring import _random_state_docstring from ..utils._validation import _deprecate_positional_args @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class BalancedBaggingClassifier(BaggingClassifier): """A Bagging classifier with additional balancing. This implementation of Bagging is similar to the scikit-learn implementation. It includes an additional step to balance the training set at fit time using a ``RandomUnderSampler``. Read more in the :ref:`User Guide `. Parameters ---------- base_estimator : object, default=None The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree. n_estimators : int, default=10 The number of base estimators in the ensemble. max_samples : int or float, default=1.0 The number of samples to draw from X to train each base estimator. - If int, then draw ``max_samples`` samples. - If float, then draw ``max_samples * X.shape[0]`` samples. max_features : int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw ``max_features`` features. - If float, then draw ``max_features * X.shape[1]`` features. bootstrap : bool, default=True Whether samples are drawn with replacement. bootstrap_features : bool, default=False Whether features are drawn with replacement. oob_score : bool, default=False Whether to use out-of-bag samples to estimate the generalization error. warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. {sampling_strategy} replacement : bool, default=False Whether or not to sample randomly with replacement or not. 
{n_jobs} {random_state} verbose : int, default=0 Controls the verbosity of the building process. Attributes ---------- base_estimator_ : estimator The base estimator from which the ensemble is grown. n_features_ : int The number of features when `fit` is performed. estimators_ : list of estimators The collection of fitted base estimators. estimators_samples_ : list of ndarray The subset of drawn samples (i.e., the in-bag samples) for each base estimator. Each subset is defined by a boolean mask. estimators_features_ : list of ndarray The subset of drawn features for each base estimator. classes_ : ndarray of shape (n_classes,) The classes labels. n_classes_ : int or list The number of classes. oob_score_ : float Score of the training dataset obtained using an out-of-bag estimate. oob_decision_function_ : ndarray of shape (n_samples, n_classes) Decision function computed with out-of-bag estimate on the training set. If n_estimators is small it might be possible that a data point was never left out during the bootstrap. In this case, ``oob_decision_function_`` might contain NaN. See Also -------- BalancedRandomForestClassifier : Random forest applying random-under sampling to balance the different bootstraps. EasyEnsembleClassifier : Ensemble of AdaBoost classifier trained on balanced bootstraps. RUSBoostClassifier : AdaBoost classifier were each bootstrap is balanced using random-under sampling at each round of boosting. Notes ----- This is possible to turn this classifier into a balanced random forest [5]_ by passing a :class:`sklearn.tree.DecisionTreeClassifier` with `max_features='auto'` as a base estimator. See :ref:`sphx_glr_auto_examples_ensemble_plot_comparison_ensemble_classifier.py`. References ---------- .. [1] L. Breiman, "Pasting small votes for classification in large databases and on-line", Machine Learning, 36(1), 85-103, 1999. .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, 1996. .. [3] T. Ho, "The random subspace method for constructing decision forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, 1998. .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. .. [5] Chen, Chao, Andy Liaw, and Leo Breiman. "Using random forest to learn imbalanced data." University of California, Berkeley 110, 2004. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from sklearn.model_selection import train_test_split >>> from sklearn.metrics import confusion_matrix >>> from imblearn.ensemble import \ BalancedBaggingClassifier # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, ... random_state=0) >>> bbc = BalancedBaggingClassifier(random_state=42) >>> bbc.fit(X_train, y_train) # doctest: +ELLIPSIS BalancedBaggingClassifier(...) 
>>> y_pred = bbc.predict(X_test) >>> print(confusion_matrix(y_test, y_pred)) [[ 23 0] [ 2 225]] """ @_deprecate_positional_args def __init__( self, base_estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, sampling_strategy="auto", replacement=False, n_jobs=None, random_state=None, verbose=0, ): super().__init__( base_estimator, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, bootstrap=bootstrap, bootstrap_features=bootstrap_features, oob_score=oob_score, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, verbose=verbose, ) self.sampling_strategy = sampling_strategy self.replacement = replacement def _validate_y(self, y): y_encoded = super()._validate_y(y) if isinstance(self.sampling_strategy, dict): self._sampling_strategy = { np.where(self.classes_ == key)[0][0]: value for key, value in check_sampling_strategy( self.sampling_strategy, y, 'under-sampling', ).items() } else: self._sampling_strategy = self.sampling_strategy return y_encoded def _validate_estimator(self, default=DecisionTreeClassifier()): """Check the estimator and the n_estimator attribute, set the `base_estimator_` attribute.""" if not isinstance(self.n_estimators, (numbers.Integral, np.integer)): raise ValueError( "n_estimators must be an integer, " "got {}.".format(type(self.n_estimators)) ) if self.n_estimators <= 0: raise ValueError( "n_estimators must be greater than zero, " "got {}.".format(self.n_estimators) ) if self.base_estimator is not None: base_estimator = clone(self.base_estimator) else: base_estimator = clone(default) self.base_estimator_ = Pipeline( [ ( "sampler", RandomUnderSampler( sampling_strategy=self._sampling_strategy, replacement=self.replacement, ), ), ("classifier", base_estimator), ] ) def fit(self, X, y): """Build a Bagging ensemble of estimators from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. y : array-like of shape (n_samples,) The target values. Returns ------- self : object Returns self. """ check_target_type(y) # RandomUnderSampler is not supporting sample_weight. We need to pass # None. 
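        # `_fit` is inherited from sklearn's BaggingClassifier; the balancing
        # itself happens because `_validate_estimator` above set
        # `base_estimator_` to Pipeline(RandomUnderSampler, classifier), so
        # every bootstrap is re-balanced before the classifier sees it.
        # Minimal usage sketch (illustrative only; names such as
        # X_train/y_train are hypothetical):
        #     bbc = BalancedBaggingClassifier(n_estimators=10, random_state=0)
        #     bbc.fit(X_train, y_train)
        #     y_pred = bbc.predict(X_test)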
return self._fit(X, y, self.max_samples, sample_weight=None) def _more_tags(self): tags = super()._more_tags() tags_key = "_xfail_checks" failing_test = "check_estimators_nan_inf" reason = "Fails because the sampler removed infinity and NaN values" if tags_key in tags: tags[tags_key][failing_test] = reason else: tags[tags_key] = {failing_test: reason} return tags imbalanced-learn-0.7.0/imblearn/ensemble/_easy_ensemble.py000066400000000000000000000161721366766276300236400ustar00rootroot00000000000000"""Class to perform under-sampling using easy ensemble.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numbers import numpy as np from sklearn.base import clone from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import BaggingClassifier from ..under_sampling import RandomUnderSampler from ..under_sampling.base import BaseUnderSampler from ..utils import Substitution, check_target_type, check_sampling_strategy from ..utils._docstring import _n_jobs_docstring from ..utils._docstring import _random_state_docstring from ..utils._validation import _deprecate_positional_args from ..pipeline import Pipeline MAX_INT = np.iinfo(np.int32).max @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class EasyEnsembleClassifier(BaggingClassifier): """Bag of balanced boosted learners also known as EasyEnsemble. This algorithm is known as EasyEnsemble [1]_. The classifier is an ensemble of AdaBoost learners trained on different balanced boostrap samples. The balancing is achieved by random under-sampling. Read more in the :ref:`User Guide `. Parameters ---------- n_estimators : int, default=10 Number of AdaBoost learners in the ensemble. base_estimator : object, default=AdaBoostClassifier() The base AdaBoost classifier used in the inner ensemble. Note that you can set the number of inner learner by passing your own instance. warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. {sampling_strategy} replacement : bool, default=False Whether or not to sample randomly with replacement or not. {n_jobs} {random_state} verbose : int, optional (default=0) Controls the verbosity of the building process. Attributes ---------- base_estimator_ : estimator The base estimator from which the ensemble is grown. estimators_ : list of estimators The collection of fitted base estimators. classes_ : array, shape (n_classes,) The classes labels. n_classes_ : int or list The number of classes. See Also -------- BalancedBaggingClassifier : Bagging classifier for which each base estimator is trained on a balanced bootstrap. BalancedRandomForestClassifier : Random forest applying random-under sampling to balance the different bootstraps. RUSBoostClassifier : AdaBoost classifier were each bootstrap is balanced using random-under sampling at each round of boosting. Notes ----- The method is described in [1]_. Supports multi-class resampling by sampling each class independently. References ---------- .. [1] X. Y. Liu, J. Wu and Z. H. Zhou, "Exploratory Undersampling for Class-Imbalance Learning," in IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550, April 2009. 
Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from sklearn.model_selection import train_test_split >>> from sklearn.metrics import confusion_matrix >>> from imblearn.ensemble import \ EasyEnsembleClassifier # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, ... random_state=0) >>> eec = EasyEnsembleClassifier(random_state=42) >>> eec.fit(X_train, y_train) # doctest: +ELLIPSIS EasyEnsembleClassifier(...) >>> y_pred = eec.predict(X_test) >>> print(confusion_matrix(y_test, y_pred)) [[ 23 0] [ 2 225]] """ @_deprecate_positional_args def __init__( self, n_estimators=10, base_estimator=None, *, warm_start=False, sampling_strategy="auto", replacement=False, n_jobs=None, random_state=None, verbose=0, ): super().__init__( base_estimator, n_estimators=n_estimators, max_samples=1.0, max_features=1.0, bootstrap=False, bootstrap_features=False, oob_score=False, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, verbose=verbose, ) self.sampling_strategy = sampling_strategy self.replacement = replacement def _validate_y(self, y): y_encoded = super()._validate_y(y) if isinstance(self.sampling_strategy, dict): self._sampling_strategy = { np.where(self.classes_ == key)[0][0]: value for key, value in check_sampling_strategy( self.sampling_strategy, y, 'under-sampling', ).items() } else: self._sampling_strategy = self.sampling_strategy return y_encoded def _validate_estimator(self, default=AdaBoostClassifier()): """Check the estimator and the n_estimator attribute, set the `base_estimator_` attribute.""" if not isinstance(self.n_estimators, (numbers.Integral, np.integer)): raise ValueError( "n_estimators must be an integer, " "got {}.".format(type(self.n_estimators)) ) if self.n_estimators <= 0: raise ValueError( "n_estimators must be greater than zero, " "got {}.".format(self.n_estimators) ) if self.base_estimator is not None: base_estimator = clone(self.base_estimator) else: base_estimator = clone(default) self.base_estimator_ = Pipeline( [ ( "sampler", RandomUnderSampler( sampling_strategy=self._sampling_strategy, replacement=self.replacement, ), ), ("classifier", base_estimator), ] ) def fit(self, X, y): """Train the ensemble on the training set. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. y : array-like of shape (n_samples,) The target values. Returns ------- self : object Returns self. """ check_target_type(y) # RandomUnderSampler is not supporting sample_weight. We need to pass # None. 
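        # Note that the parent bagging is constructed with bootstrap=False
        # (see `__init__` above): the diversity of the ensemble comes from
        # the random under-sampling step inside each pipeline, not from
        # bootstrap resampling of the training set.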
        return self._fit(X, y, self.max_samples, sample_weight=None)

imbalanced-learn-0.7.0/imblearn/ensemble/_forest.py
"""Forest classifiers trained on balanced bootstrap samples."""

# Authors: Guillaume Lemaitre
# License: MIT

import numbers
from warnings import warn
from copy import deepcopy

import numpy as np

from numpy import float32 as DTYPE
from numpy import float64 as DOUBLE

from scipy.sparse import issparse

from joblib import Parallel, delayed

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble._base import _set_random_states
from sklearn.ensemble._forest import _get_n_samples_bootstrap
from sklearn.ensemble._forest import _parallel_build_trees
from sklearn.ensemble._forest import _generate_unsampled_indices
from sklearn.exceptions import DataConversionWarning
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array
from sklearn.utils import check_random_state
from sklearn.utils import _safe_indexing
from sklearn.utils.validation import _check_sample_weight

from ..pipeline import make_pipeline
from ..under_sampling import RandomUnderSampler
from ..under_sampling.base import BaseUnderSampler
from ..utils import Substitution
from ..utils._docstring import _n_jobs_docstring
from ..utils._docstring import _random_state_docstring
from ..utils._validation import check_sampling_strategy
from ..utils._validation import _deprecate_positional_args

MAX_INT = np.iinfo(np.int32).max


def _local_parallel_build_trees(
    sampler,
    tree,
    forest,
    X,
    y,
    sample_weight,
    tree_idx,
    n_trees,
    verbose=0,
    class_weight=None,
    n_samples_bootstrap=None,
):
    # resample before fitting the tree
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    if sample_weight is not None:
        sample_weight = _safe_indexing(sample_weight, sampler.sample_indices_)
    if n_samples_bootstrap is not None:
        # guard on the value itself (not on the imported helper) and cap the
        # bootstrap size to the size of the balanced resample
        n_samples_bootstrap = min(n_samples_bootstrap, X_resampled.shape[0])
    tree = _parallel_build_trees(
        tree,
        forest,
        X_resampled,
        y_resampled,
        sample_weight,
        tree_idx,
        n_trees,
        verbose=verbose,
        class_weight=class_weight,
        n_samples_bootstrap=n_samples_bootstrap,
    )
    return sampler, tree


@Substitution(
    sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
    random_state=_random_state_docstring,
)
class BalancedRandomForestClassifier(RandomForestClassifier):
    """A balanced random forest classifier.

    A balanced random forest randomly under-samples each bootstrap sample to
    balance it.

    Read more in the :ref:`User Guide `.

    Parameters
    ----------
    n_estimators : int, default=100
        The number of trees in the forest.

    criterion : str, default="gini"
        The function to measure the quality of a split. Supported criteria
        are "gini" for the Gini impurity and "entropy" for the information
        gain. Note: this parameter is tree-specific.

    max_depth : int, default=None
        The maximum depth of the tree. If None, then nodes are expanded
        until all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int, float, default=2
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum number of
          samples for each split.
min_samples_leaf : int, float, default=1 The minimum number of samples required to be at a leaf node: - If int, then consider ``min_samples_leaf`` as the minimum number. - If float, then ``min_samples_leaf`` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. max_features : {{"auto", "sqrt", "log2"}}, int, float, or None, \ default="auto" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto"). - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. max_leaf_nodes : int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. bootstrap : bool, default=True Whether bootstrap samples are used when building trees. oob_score : bool, default=False Whether to use out-of-bag samples to estimate the generalization accuracy. {sampling_strategy} replacement : bool, default=False Whether or not to sample randomly with replacement or not. {n_jobs} {random_state} verbose : int, default=0 Controls the verbosity of the tree building process. warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. class_weight : dict, list of dicts, {{"balanced", "balanced_subsample"}}, \ default=None Weights associated with classes in the form dictionary with the key being the class_label and the value the weight. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. Note that for multioutput (including multilabel) weights should be defined for each class of every column in its own dict. For example, for four-class multilabel classification weights should be [{{0: 1, 1: 1}}, {{0: 1, 1: 5}}, {{0: 1, 1: 1}}, {{0: 1, 1: 1}}] instead of [{{1:1}}, {{2:5}}, {{3:1}}, {{4:1}}]. 
The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` The "balanced_subsample" mode is the same as "balanced" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. .. versionadded:: 0.6 Added in `scikit-learn` in 0.22 max_samples : int or float, default=None If bootstrap is True, the number of samples to draw from X to train each base estimator. - If None (default), then draw `X.shape[0]` samples. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0, 1)`. Be aware that the final number samples used will be the minimum between the number of samples given in `max_samples` and the number of samples obtained after resampling. .. versionadded:: 0.6 Added in `scikit-learn` in 0.22 Attributes ---------- estimators_ : list of DecisionTreeClassifier The collection of fitted sub-estimators. samplers_ : list of RandomUnderSampler The collection of fitted samplers. pipelines_ : list of Pipeline. The collection of fitted pipelines (samplers + trees). classes_ : ndarray of shape (n_classes,) or a list of such arrays The classes labels (single output problem), or a list of arrays of class labels (multi-output problem). n_classes_ : int or list The number of classes (single output problem), or a list containing the number of classes for each output (multi-output problem). n_features_ : int The number of features when ``fit`` is performed. n_outputs_ : int The number of outputs when ``fit`` is performed. feature_importances_ : ndarray of shape (n_features,) The feature importances (the higher, the more important the feature). oob_score_ : float Score of the training dataset obtained using an out-of-bag estimate. oob_decision_function_ : ndarray of shape (n_samples, n_classes) Decision function computed with out-of-bag estimate on the training set. If n_estimators is small it might be possible that a data point was never left out during the bootstrap. In this case, `oob_decision_function_` might contain NaN. See Also -------- BalancedBaggingClassifier : Bagging classifier for which each base estimator is trained on a balanced bootstrap. EasyEnsembleClassifier : Ensemble of AdaBoost classifier trained on balanced bootstraps. RUSBoostClassifier : AdaBoost classifier were each bootstrap is balanced using random-under sampling at each round of boosting. References ---------- .. [1] Chen, Chao, Andy Liaw, and Leo Breiman. "Using random forest to learn imbalanced data." University of California, Berkeley 110 (2004): 1-12. Examples -------- >>> from imblearn.ensemble import BalancedRandomForestClassifier >>> from sklearn.datasets import make_classification >>> >>> X, y = make_classification(n_samples=1000, n_classes=3, ... n_informative=4, weights=[0.2, 0.3, 0.5], ... random_state=0) >>> clf = BalancedRandomForestClassifier(max_depth=2, random_state=0) >>> clf.fit(X, y) # doctest: +ELLIPSIS BalancedRandomForestClassifier(...) 
>>> print(clf.feature_importances_) # doctest: +ELLIPSIS [...] >>> print(clf.predict([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])) [1] """ @_deprecate_positional_args def __init__( self, n_estimators=100, *, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, sampling_strategy="auto", replacement=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None, ): super().__init__( criterion=criterion, max_depth=max_depth, n_estimators=n_estimators, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, max_samples=max_samples, ) self.sampling_strategy = sampling_strategy self.replacement = replacement def _validate_estimator(self, default=DecisionTreeClassifier()): """Check the estimator and the n_estimator attribute, set the `base_estimator_` attribute.""" if not isinstance(self.n_estimators, (numbers.Integral, np.integer)): raise ValueError( "n_estimators must be an integer, " "got {}.".format(type(self.n_estimators)) ) if self.n_estimators <= 0: raise ValueError( "n_estimators must be greater than zero, " "got {}.".format(self.n_estimators) ) if self.base_estimator is not None: self.base_estimator_ = clone(self.base_estimator) else: self.base_estimator_ = clone(default) self.base_sampler_ = RandomUnderSampler( sampling_strategy=self._sampling_strategy, replacement=self.replacement, ) def _make_sampler_estimator(self, random_state=None): """Make and configure a copy of the `base_estimator_` attribute. Warning: This method should be used to properly instantiate new sub-estimators. """ estimator = clone(self.base_estimator_) estimator.set_params( **{p: getattr(self, p) for p in self.estimator_params} ) sampler = clone(self.base_sampler_) if random_state is not None: _set_random_states(estimator, random_state) _set_random_states(sampler, random_state) return estimator, sampler def fit(self, X, y, sample_weight=None): """Build a forest of trees from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. y : array-like of shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). sample_weight : array-like of shape (n_samples,) Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. Returns ------- self : object The fitted instance. """ # Validate or convert input data if issparse(y): raise ValueError( "sparse multilabel-indicator for y is not supported." 
) X, y = self._validate_data(X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. X.sort_indices() # Remap output _, self.n_features_ = X.shape y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: warn( "A column-vector y was passed when a 1d array was" " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, stacklevel=2, ) if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs # [:, np.newaxis] that does not. y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] y_encoded, expanded_class_weight = self._validate_y_class_weight(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y_encoded = np.ascontiguousarray(y_encoded, dtype=DOUBLE) if isinstance(self.sampling_strategy, dict): self._sampling_strategy = { np.where(self.classes_[0] == key)[0][0]: value for key, value in check_sampling_strategy( self.sampling_strategy, y, 'under-sampling', ).items() } else: self._sampling_strategy = self.sampling_strategy if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight # Get bootstrap sample size n_samples_bootstrap = _get_n_samples_bootstrap( n_samples=X.shape[0], max_samples=self.max_samples ) # Check parameters self._validate_estimator() if not self.bootstrap and self.oob_score: raise ValueError( "Out of bag estimation only available" " if bootstrap=True" ) random_state = check_random_state(self.random_state) if not self.warm_start or not hasattr(self, "estimators_"): # Free allocated memory, if any self.estimators_ = [] self.samplers_ = [] self.pipelines_ = [] n_more_estimators = self.n_estimators - len(self.estimators_) if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " "len(estimators_)=%d when warm_start==True" % (self.n_estimators, len(self.estimators_)) ) elif n_more_estimators == 0: warn( "Warm-start fitting without increasing n_estimators does not " "fit new trees." ) else: if self.warm_start and len(self.estimators_) > 0: # We draw from the random state to get the random state we # would have got if we hadn't used a warm_start. random_state.randint(MAX_INT, size=len(self.estimators_)) trees = [] samplers = [] for _ in range(n_more_estimators): tree, sampler = self._make_sampler_estimator( random_state=random_state ) trees.append(tree) samplers.append(sampler) # Parallel loop: we prefer the threading backend as the Cython code # for fitting the trees is internally releasing the Python GIL # making threading more efficient than multiprocessing in # that case. However, we respect any parallel_backend contexts set # at a higher level, since correctness does not rely on using # threads. 
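            # Each job receives its own (sampler, tree) pair:
            # `_local_parallel_build_trees` first balances the data with the
            # RandomUnderSampler and then defers to scikit-learn's
            # `_parallel_build_trees` for the actual tree fitting.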
samplers_trees = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, prefer="threads" )( delayed(_local_parallel_build_trees)( s, t, self, X, y_encoded, sample_weight, i, len(trees), verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap, ) for i, (s, t) in enumerate(zip(samplers, trees)) ) samplers, trees = zip(*samplers_trees) # Collect newly grown trees self.estimators_.extend(trees) self.samplers_.extend(samplers) # Create pipeline with the fitted samplers and trees self.pipelines_.extend( [ make_pipeline(deepcopy(s), deepcopy(t)) for s, t in zip(samplers, trees) ] ) if self.oob_score: self._set_oob_score(X, y_encoded) # Decapsulate classes_ attributes if hasattr(self, "classes_") and self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self def _set_oob_score(self, X, y): """Compute out-of-bag score.""" X = check_array(X, dtype=DTYPE, accept_sparse='csr') n_classes_ = self.n_classes_ n_samples = y.shape[0] oob_decision_function = [] oob_score = 0.0 predictions = [np.zeros((n_samples, n_classes_[k])) for k in range(self.n_outputs_)] for sampler, estimator in zip(self.samplers_, self.estimators_): X_resample = X[sampler.sample_indices_] y_resample = y[sampler.sample_indices_] n_sample_subset = y_resample.shape[0] n_samples_bootstrap = _get_n_samples_bootstrap( n_sample_subset, self.max_samples ) unsampled_indices = _generate_unsampled_indices( estimator.random_state, n_sample_subset, n_samples_bootstrap ) p_estimator = estimator.predict_proba( X_resample[unsampled_indices, :], check_input=False ) if self.n_outputs_ == 1: p_estimator = [p_estimator] for k in range(self.n_outputs_): indices = sampler.sample_indices_[unsampled_indices] predictions[k][indices, :] += p_estimator[k] for k in range(self.n_outputs_): if (predictions[k].sum(axis=1) == 0).any(): warn("Some inputs do not have OOB scores. 
" "This probably means too few trees were used " "to compute any reliable oob estimates.") with np.errstate(invalid="ignore", divide="ignore"): # with the resampling, we are likely to have rows not included # for the OOB score leading to division by zero decision = (predictions[k] / predictions[k].sum(axis=1)[:, np.newaxis]) mask_scores = np.isnan(np.sum(decision, axis=1)) oob_decision_function.append(decision) oob_score += np.mean( y[~mask_scores, k] == np.argmax(predictions[k][~mask_scores], axis=1), axis=0) if self.n_outputs_ == 1: self.oob_decision_function_ = oob_decision_function[0] else: self.oob_decision_function_ = oob_decision_function self.oob_score_ = oob_score / self.n_outputs_ def _more_tags(self): return {"multioutput": False} imbalanced-learn-0.7.0/imblearn/ensemble/_weight_boosting.py000066400000000000000000000266741366766276300242300ustar00rootroot00000000000000from copy import deepcopy import numpy as np from sklearn.base import clone from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble._base import _set_random_states from sklearn.utils import _safe_indexing from ..under_sampling.base import BaseUnderSampler from ..under_sampling import RandomUnderSampler from ..pipeline import make_pipeline from ..utils import Substitution, check_target_type from ..utils._docstring import _random_state_docstring from ..utils._validation import _deprecate_positional_args @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, random_state=_random_state_docstring, ) class RUSBoostClassifier(AdaBoostClassifier): """Random under-sampling integrated in the learning of AdaBoost. During learning, the problem of class balancing is alleviated by random under-sampling the sample at each iteration of the boosting algorithm. Read more in the :ref:`User Guide `. Parameters ---------- base_estimator : object, default=None The base estimator from which the boosted ensemble is built. Support for sample weighting is required, as well as proper ``classes_`` and ``n_classes_`` attributes. If ``None``, then the base estimator is ``DecisionTreeClassifier(max_depth=1)``. n_estimators : int, default=50 The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. learning_rate : float, default=1.0 Learning rate shrinks the contribution of each classifier by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``. algorithm : {{'SAMME', 'SAMME.R'}}, default='SAMME.R' If 'SAMME.R' then use the SAMME.R real boosting algorithm. ``base_estimator`` must support calculation of class probabilities. If 'SAMME' then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations. {sampling_strategy} replacement : bool, default=False Whether or not to sample randomly with replacement or not. {random_state} Attributes ---------- base_estimator_ : estimator The base estimator from which the ensemble is grown. estimators_ : list of classifiers The collection of fitted sub-estimators. samplers_ : list of RandomUnderSampler The collection of fitted samplers. pipelines_ : list of Pipeline The collection of fitted pipelines (samplers + trees). classes_ : ndarray of shape (n_classes,) The classes labels. n_classes_ : int The number of classes. estimator_weights_ : ndarray of shape (n_estimator,) Weights for each estimator in the boosted ensemble. 
estimator_errors_ : ndarray of shape (n_estimator,) Classification error for each estimator in the boosted ensemble. feature_importances_ : ndarray of shape (n_features,) The feature importances if supported by the ``base_estimator``. See Also -------- BalancedBaggingClassifier : Bagging classifier for which each base estimator is trained on a balanced bootstrap. BalancedRandomForestClassifier : Random forest applying random-under sampling to balance the different bootstraps. EasyEnsembleClassifier : Ensemble of AdaBoost classifier trained on balanced bootstraps. References ---------- .. [1] Seiffert, C., Khoshgoftaar, T. M., Van Hulse, J., & Napolitano, A. "RUSBoost: A hybrid approach to alleviating class imbalance." IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans 40.1 (2010): 185-197. Examples -------- >>> from imblearn.ensemble import RUSBoostClassifier >>> from sklearn.datasets import make_classification >>> >>> X, y = make_classification(n_samples=1000, n_classes=3, ... n_informative=4, weights=[0.2, 0.3, 0.5], ... random_state=0) >>> clf = RUSBoostClassifier(random_state=0) >>> clf.fit(X, y) # doctest: +ELLIPSIS RUSBoostClassifier(...) >>> clf.predict(X) # doctest: +ELLIPSIS array([...]) """ @_deprecate_positional_args def __init__( self, base_estimator=None, *, n_estimators=50, learning_rate=1.0, algorithm="SAMME.R", sampling_strategy="auto", replacement=False, random_state=None, ): super().__init__( base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algorithm, random_state=random_state, ) self.sampling_strategy = sampling_strategy self.replacement = replacement def fit(self, X, y, sample_weight=None): """Build a boosted classifier from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Sparse matrix can be CSC, CSR, COO, DOK, or LIL. DOK and LIL are converted to CSR. y : array-like of shape (n_samples,) The target values (class labels). sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, the sample weights are initialized to ``1 / n_samples``. Returns ------- self : object Returns self. """ check_target_type(y) self.samplers_ = [] self.pipelines_ = [] super().fit(X, y, sample_weight) return self def _validate_estimator(self): """Check the estimator and the n_estimator attribute, set the `base_estimator_` attribute.""" super()._validate_estimator() self.base_sampler_ = RandomUnderSampler( sampling_strategy=self.sampling_strategy, replacement=self.replacement, ) def _make_sampler_estimator(self, append=True, random_state=None): """Make and configure a copy of the `base_estimator_` attribute. Warning: This method should be used to properly instantiate new sub-estimators. 
""" estimator = clone(self.base_estimator_) estimator.set_params( **{p: getattr(self, p) for p in self.estimator_params} ) sampler = clone(self.base_sampler_) if random_state is not None: _set_random_states(estimator, random_state) _set_random_states(sampler, random_state) if append: self.estimators_.append(estimator) self.samplers_.append(sampler) self.pipelines_.append( make_pipeline(deepcopy(sampler), deepcopy(estimator)) ) return estimator, sampler def _boost_real(self, iboost, X, y, sample_weight, random_state): """Implement a single boost using the SAMME.R real algorithm.""" estimator, sampler = self._make_sampler_estimator( random_state=random_state ) X_res, y_res = sampler.fit_resample(X, y) sample_weight_res = _safe_indexing( sample_weight, sampler.sample_indices_ ) estimator.fit(X_res, y_res, sample_weight=sample_weight_res) y_predict_proba = estimator.predict_proba(X) if iboost == 0: self.classes_ = getattr(estimator, "classes_", None) self.n_classes_ = len(self.classes_) y_predict = self.classes_.take( np.argmax(y_predict_proba, axis=1), axis=0 ) # Instances incorrectly classified incorrect = y_predict != y # Error fraction estimator_error = np.mean( np.average(incorrect, weights=sample_weight, axis=0) ) # Stop if classification is perfect if estimator_error <= 0: return sample_weight, 1.0, 0.0 # Construct y coding as described in Zhu et al [2]: # # y_k = 1 if c == k else -1 / (K - 1) # # where K == n_classes_ and c, k in [0, K) are indices along the second # axis of the y coding with c being the index corresponding to the true # class label. n_classes = self.n_classes_ classes = self.classes_ y_codes = np.array([-1.0 / (n_classes - 1), 1.0]) y_coding = y_codes.take(classes == y[:, np.newaxis]) # Displace zero probabilities so the log is defined. # Also fix negative elements which may occur with # negative sample weights. 
proba = y_predict_proba # alias for readability np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba) # Boost weight using multi-class AdaBoost SAMME.R alg estimator_weight = ( -1.0 * self.learning_rate * ((n_classes - 1.0) / n_classes) * (y_coding * np.log(y_predict_proba)).sum(axis=1) ) # Only boost the weights if it will fit again if not iboost == self.n_estimators - 1: # Only boost positive weights sample_weight *= np.exp( estimator_weight * ((sample_weight > 0) | (estimator_weight < 0)) ) return sample_weight, 1.0, estimator_error def _boost_discrete(self, iboost, X, y, sample_weight, random_state): """Implement a single boost using the SAMME discrete algorithm.""" estimator, sampler = self._make_sampler_estimator( random_state=random_state ) X_res, y_res = sampler.fit_resample(X, y) sample_weight_res = _safe_indexing( sample_weight, sampler.sample_indices_ ) estimator.fit(X_res, y_res, sample_weight=sample_weight_res) y_predict = estimator.predict(X) if iboost == 0: self.classes_ = getattr(estimator, "classes_", None) self.n_classes_ = len(self.classes_) # Instances incorrectly classified incorrect = y_predict != y # Error fraction estimator_error = np.mean( np.average(incorrect, weights=sample_weight, axis=0) ) # Stop if classification is perfect if estimator_error <= 0: return sample_weight, 1.0, 0.0 n_classes = self.n_classes_ # Stop if the error is at least as bad as random guessing if estimator_error >= 1.0 - (1.0 / n_classes): self.estimators_.pop(-1) self.samplers_.pop(-1) self.pipelines_.pop(-1) if len(self.estimators_) == 0: raise ValueError( "BaseClassifier in AdaBoostClassifier " "ensemble is worse than random, ensemble " "can not be fit." ) return None, None, None # Boost weight using multi-class AdaBoost SAMME alg estimator_weight = self.learning_rate * ( np.log((1.0 - estimator_error) / estimator_error) + np.log(n_classes - 1.0) ) # Only boost the weights if I will fit again if not iboost == self.n_estimators - 1: # Only boost positive weights sample_weight *= np.exp( estimator_weight * incorrect * (sample_weight > 0) ) return sample_weight, estimator_weight, estimator_error imbalanced-learn-0.7.0/imblearn/ensemble/tests/000077500000000000000000000000001366766276300214475ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/ensemble/tests/__init__.py000066400000000000000000000000001366766276300235460ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/ensemble/tests/test_bagging.py000066400000000000000000000407431366766276300244660ustar00rootroot00000000000000"""Test the module ensemble classifiers.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np import pytest from sklearn.datasets import load_iris, make_hastie_10_2 from sklearn.model_selection import ( GridSearchCV, ParameterGrid, train_test_split, ) from sklearn.dummy import DummyClassifier from sklearn.linear_model import Perceptron, LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.feature_selection import SelectKBest from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from imblearn.datasets import make_imbalance from imblearn.ensemble import BalancedBaggingClassifier from imblearn.pipeline import make_pipeline from imblearn.under_sampling import RandomUnderSampler iris = load_iris() def test_balanced_bagging_classifier(): # Check 
classification for various parameter settings. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) grid = ParameterGrid( { "max_samples": [0.5, 1.0], "max_features": [1, 2, 4], "bootstrap": [True, False], "bootstrap_features": [True, False], } ) for base_estimator in [ None, DummyClassifier(strategy="prior"), Perceptron(max_iter=1000, tol=1e-3), DecisionTreeClassifier(), KNeighborsClassifier(), SVC(gamma="scale"), ]: for params in grid: BalancedBaggingClassifier( base_estimator=base_estimator, random_state=0, **params ).fit(X_train, y_train).predict(X_test) def test_bootstrap_samples(): # Test that bootstrapping samples generate non-perfect base estimators. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) base_estimator = DecisionTreeClassifier().fit(X_train, y_train) # without bootstrap, all trees are perfect on the training set # disable the resampling by passing an empty dictionary. ensemble = BalancedBaggingClassifier( base_estimator=DecisionTreeClassifier(), max_samples=1.0, bootstrap=False, n_estimators=10, sampling_strategy={}, random_state=0, ).fit(X_train, y_train) assert ensemble.score(X_train, y_train) == base_estimator.score( X_train, y_train ) # with bootstrap, trees are no longer perfect on the training set ensemble = BalancedBaggingClassifier( base_estimator=DecisionTreeClassifier(), max_samples=1.0, bootstrap=True, random_state=0, ).fit(X_train, y_train) assert ensemble.score(X_train, y_train) < base_estimator.score( X_train, y_train ) def test_bootstrap_features(): # Test that bootstrapping features may generate duplicate features. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) ensemble = BalancedBaggingClassifier( base_estimator=DecisionTreeClassifier(), max_features=1.0, bootstrap_features=False, random_state=0, ).fit(X_train, y_train) for features in ensemble.estimators_features_: assert np.unique(features).shape[0] == X.shape[1] ensemble = BalancedBaggingClassifier( base_estimator=DecisionTreeClassifier(), max_features=1.0, bootstrap_features=True, random_state=0, ).fit(X_train, y_train) unique_features = [ np.unique(features).shape[0] for features in ensemble.estimators_features_ ] assert np.median(unique_features) < X.shape[1] def test_probability(): # Predict probabilities. 
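    # The ensemble's predict_proba rows must each sum to one and must agree
    # with exp(predict_log_proba), including the degenerate case where a
    # small max_samples leaves some classes unseen by individual estimators.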
X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) with np.errstate(divide="ignore", invalid="ignore"): # Normal case ensemble = BalancedBaggingClassifier( base_estimator=DecisionTreeClassifier(), random_state=0 ).fit(X_train, y_train) assert_array_almost_equal( np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test)), ) assert_array_almost_equal( ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test)), ) # Degenerate case, where some classes are missing ensemble = BalancedBaggingClassifier( base_estimator=LogisticRegression( solver="lbfgs", multi_class="auto" ), random_state=0, max_samples=5, ) ensemble.fit(X_train, y_train) assert_array_almost_equal( np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test)), ) assert_array_almost_equal( ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test)), ) def test_oob_score_classification(): # Check that oob prediction is a good estimation of the generalization # error. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) for base_estimator in [DecisionTreeClassifier(), SVC(gamma="scale")]: clf = BalancedBaggingClassifier( base_estimator=base_estimator, n_estimators=100, bootstrap=True, oob_score=True, random_state=0, ).fit(X_train, y_train) test_score = clf.score(X_test, y_test) assert abs(test_score - clf.oob_score_) < 0.1 # Test with few estimators with pytest.warns(UserWarning): BalancedBaggingClassifier( base_estimator=base_estimator, n_estimators=1, bootstrap=True, oob_score=True, random_state=0, ).fit(X_train, y_train) def test_single_estimator(): # Check singleton ensembles. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf1 = BalancedBaggingClassifier( base_estimator=KNeighborsClassifier(), n_estimators=1, bootstrap=False, bootstrap_features=False, random_state=0, ).fit(X_train, y_train) clf2 = make_pipeline( RandomUnderSampler( random_state=clf1.estimators_[0].steps[0][1].random_state ), KNeighborsClassifier(), ).fit(X_train, y_train) assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) def test_error(): # Test that it gives proper exception on deficient input. 
X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50} ) base = DecisionTreeClassifier() # Test n_estimators with pytest.raises(ValueError): BalancedBaggingClassifier(base, n_estimators=1.5).fit(X, y) with pytest.raises(ValueError): BalancedBaggingClassifier(base, n_estimators=-1).fit(X, y) # Test max_samples with pytest.raises(ValueError): BalancedBaggingClassifier(base, max_samples=-1).fit(X, y) with pytest.raises(ValueError): BalancedBaggingClassifier(base, max_samples=0.0).fit(X, y) with pytest.raises(ValueError): BalancedBaggingClassifier(base, max_samples=2.0).fit(X, y) with pytest.raises(ValueError): BalancedBaggingClassifier(base, max_samples=1000).fit(X, y) with pytest.raises(ValueError): BalancedBaggingClassifier(base, max_samples="foobar").fit(X, y) # Test max_features with pytest.raises(ValueError): BalancedBaggingClassifier(base, max_features=-1).fit(X, y) with pytest.raises(ValueError): BalancedBaggingClassifier(base, max_features=0.0).fit(X, y) with pytest.raises(ValueError): BalancedBaggingClassifier(base, max_features=2.0).fit(X, y) with pytest.raises(ValueError): BalancedBaggingClassifier(base, max_features=5).fit(X, y) with pytest.raises(ValueError): BalancedBaggingClassifier(base, max_features="foobar").fit(X, y) # Test support of decision_function assert not ( hasattr(BalancedBaggingClassifier(base).fit(X, y), "decision_function") ) def test_gridsearch(): # Check that bagging ensembles can be grid-searched. # Transform iris into a binary classification task X, y = iris.data, iris.target.copy() y[y == 2] = 1 # Grid search with scoring based on decision_function parameters = {"n_estimators": (1, 2), "base_estimator__C": (1, 2)} GridSearchCV( BalancedBaggingClassifier(SVC(gamma="scale")), parameters, cv=3, scoring="roc_auc", ).fit(X, y) def test_base_estimator(): # Check base_estimator and its default values. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) ensemble = BalancedBaggingClassifier(None, n_jobs=3, random_state=0).fit( X_train, y_train ) assert isinstance( ensemble.base_estimator_.steps[-1][1], DecisionTreeClassifier ) ensemble = BalancedBaggingClassifier( DecisionTreeClassifier(), n_jobs=3, random_state=0 ).fit(X_train, y_train) assert isinstance( ensemble.base_estimator_.steps[-1][1], DecisionTreeClassifier ) ensemble = BalancedBaggingClassifier( Perceptron(max_iter=1000, tol=1e-3), n_jobs=3, random_state=0 ).fit(X_train, y_train) assert isinstance(ensemble.base_estimator_.steps[-1][1], Perceptron) def test_bagging_with_pipeline(): X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) estimator = BalancedBaggingClassifier( make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), max_features=2, ) estimator.fit(X, y).predict(X) def test_warm_start(random_state=42): # Test if fitting incrementally with warm start gives a forest of the # right size and the same results as a normal fit. 
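    # Warm-start protocol exercised below (sketch of the calls under test):
    #     clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True)
    #     clf.fit(X, y)
    #     clf.set_params(n_estimators=10)
    #     clf.fit(X, y)  # grows the ensemble by 5 instead of refitting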
X, y = make_hastie_10_2(n_samples=20, random_state=1) clf_ws = None for n_estimators in [5, 10]: if clf_ws is None: clf_ws = BalancedBaggingClassifier( n_estimators=n_estimators, random_state=random_state, warm_start=True, ) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert len(clf_ws) == n_estimators clf_no_ws = BalancedBaggingClassifier( n_estimators=10, random_state=random_state, warm_start=False ) clf_no_ws.fit(X, y) assert {pipe.steps[-1][1].random_state for pipe in clf_ws} == { pipe.steps[-1][1].random_state for pipe in clf_no_ws } def test_warm_start_smaller_n_estimators(): # Test if warm start'ed second fit with smaller n_estimators raises error. X, y = make_hastie_10_2(n_samples=20, random_state=1) clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True) clf.fit(X, y) clf.set_params(n_estimators=4) with pytest.raises(ValueError): clf.fit(X, y) def test_warm_start_equal_n_estimators(): # Test that nothing happens when fitting without increasing n_estimators X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf = BalancedBaggingClassifier( n_estimators=5, warm_start=True, random_state=83 ) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) # modify X to nonsense values, this should not change anything X_train += 1.0 warn_msg = "Warm-start fitting without increasing n_estimators does not" with pytest.warns(UserWarning, match=warn_msg): clf.fit(X_train, y_train) assert_array_equal(y_pred, clf.predict(X_test)) def test_warm_start_equivalence(): # warm started classifier with 5+5 estimators should be equivalent to # one classifier with 10 estimators X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf_ws = BalancedBaggingClassifier( n_estimators=5, warm_start=True, random_state=3141 ) clf_ws.fit(X_train, y_train) clf_ws.set_params(n_estimators=10) clf_ws.fit(X_train, y_train) y1 = clf_ws.predict(X_test) clf = BalancedBaggingClassifier( n_estimators=10, warm_start=False, random_state=3141 ) clf.fit(X_train, y_train) y2 = clf.predict(X_test) assert_array_almost_equal(y1, y2) def test_warm_start_with_oob_score_fails(): # Check using oob_score and warm_start simultaneously fails X, y = make_hastie_10_2(n_samples=20, random_state=1) clf = BalancedBaggingClassifier( n_estimators=5, warm_start=True, oob_score=True ) with pytest.raises(ValueError): clf.fit(X, y) def test_oob_score_removed_on_warm_start(): X, y = make_hastie_10_2(n_samples=2000, random_state=1) clf = BalancedBaggingClassifier(n_estimators=50, oob_score=True) clf.fit(X, y) clf.set_params(warm_start=True, oob_score=False, n_estimators=100) clf.fit(X, y) with pytest.raises(AttributeError): getattr(clf, "oob_score_") def test_oob_score_consistency(): # Make sure OOB scores are identical when random_state, estimator, and # training data are fixed and fitting is done twice X, y = make_hastie_10_2(n_samples=200, random_state=1) bagging = BalancedBaggingClassifier( KNeighborsClassifier(), max_samples=0.5, max_features=0.5, oob_score=True, random_state=1, ) assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_ def test_estimators_samples(): # Check that format of estimators_samples_ is correct and that results # generated at fit time can be identically reproduced at a later time # using data saved in object attributes. 
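    # A sketch of the reproduction performed below, assuming ``bag`` is a
    # fitted BalancedBaggingClassifier with ``bootstrap=False``:
    #
    #   samples = bag.estimators_samples_[0]     # integer indices
    #   features = bag.estimators_features_[0]
    #   X_sub, y_sub = X[samples][:, features], y[samples]
    #   # refitting bag.estimators_[0] on (X_sub, y_sub) reproduces the
    #   # same coefficients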
X, y = make_hastie_10_2(n_samples=200, random_state=1) # remap the y outside of the BalancedBaggingclassifier # _, y = np.unique(y, return_inverse=True) bagging = BalancedBaggingClassifier( LogisticRegression(solver="lbfgs", multi_class="auto"), max_samples=0.5, max_features=0.5, random_state=1, bootstrap=False, ) bagging.fit(X, y) # Get relevant attributes estimators_samples = bagging.estimators_samples_ estimators_features = bagging.estimators_features_ estimators = bagging.estimators_ # Test for correct formatting assert len(estimators_samples) == len(estimators) assert len(estimators_samples[0]) == len(X) // 2 assert estimators_samples[0].dtype.kind == "i" # Re-fit single estimator to test for consistent sampling estimator_index = 0 estimator_samples = estimators_samples[estimator_index] estimator_features = estimators_features[estimator_index] estimator = estimators[estimator_index] X_train = (X[estimator_samples])[:, estimator_features] y_train = y[estimator_samples] orig_coefs = estimator.steps[-1][1].coef_ estimator.fit(X_train, y_train) new_coefs = estimator.steps[-1][1].coef_ assert_allclose(orig_coefs, new_coefs) def test_max_samples_consistency(): # Make sure validated max_samples and original max_samples are identical # when valid integer max_samples supplied by user max_samples = 100 X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1) bagging = BalancedBaggingClassifier( KNeighborsClassifier(), max_samples=max_samples, max_features=0.5, random_state=1, ) bagging.fit(X, y) assert bagging._max_samples == max_samples imbalanced-learn-0.7.0/imblearn/ensemble/tests/test_easy_ensemble.py000066400000000000000000000170071366766276300257000ustar00rootroot00000000000000"""Test the module easy ensemble.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import pytest import numpy as np from sklearn.datasets import load_iris, make_hastie_10_2 from sklearn.ensemble import AdaBoostClassifier from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from sklearn.feature_selection import SelectKBest from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from imblearn.ensemble import EasyEnsembleClassifier from imblearn.datasets import make_imbalance from imblearn.under_sampling import RandomUnderSampler from imblearn.pipeline import make_pipeline iris = load_iris() # Generate a global dataset to use RND_SEED = 0 X = np.array( [ [0.5220963, 0.11349303], [0.59091459, 0.40692742], [1.10915364, 0.05718352], [0.22039505, 0.26469445], [1.35269503, 0.44812421], [0.85117925, 1.0185556], [-2.10724436, 0.70263997], [-0.23627356, 0.30254174], [-1.23195149, 0.15427291], [-0.58539673, 0.62515052], ] ) Y = np.array([1, 2, 2, 2, 1, 0, 1, 1, 1, 0]) @pytest.mark.parametrize("n_estimators", [10, 20]) @pytest.mark.parametrize( "base_estimator", [AdaBoostClassifier(n_estimators=5), AdaBoostClassifier(n_estimators=10)], ) def test_easy_ensemble_classifier(n_estimators, base_estimator): # Check classification for various parameter settings. 
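    # EasyEnsembleClassifier bags AdaBoost learners, each trained on a
    # randomly under-sampled (balanced) bootstrap. A minimal usage sketch
    # mirroring the imports of this module:
    #
    #   eec = EasyEnsembleClassifier(
    #       n_estimators=10, base_estimator=AdaBoostClassifier(),
    #       random_state=0,
    #   ).fit(X_train, y_train)
    #   eec.score(X_test, y_test)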
X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) eec = EasyEnsembleClassifier( n_estimators=n_estimators, base_estimator=base_estimator, n_jobs=-1, random_state=RND_SEED, ) eec.fit(X_train, y_train).score(X_test, y_test) assert len(eec.estimators_) == n_estimators for est in eec.estimators_: assert ( len(est.named_steps["classifier"]) == base_estimator.n_estimators ) # test the different prediction function eec.predict(X_test) eec.predict_proba(X_test) eec.predict_log_proba(X_test) eec.decision_function(X_test) def test_base_estimator(): # Check base_estimator and its default values. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) ensemble = EasyEnsembleClassifier(2, None, n_jobs=-1, random_state=0).fit( X_train, y_train ) assert isinstance( ensemble.base_estimator_.steps[-1][1], AdaBoostClassifier ) ensemble = EasyEnsembleClassifier( 2, AdaBoostClassifier(), n_jobs=-1, random_state=0 ).fit(X_train, y_train) assert isinstance( ensemble.base_estimator_.steps[-1][1], AdaBoostClassifier ) def test_bagging_with_pipeline(): X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) estimator = EasyEnsembleClassifier( n_estimators=2, base_estimator=make_pipeline(SelectKBest(k=1), AdaBoostClassifier()), ) estimator.fit(X, y).predict(X) def test_warm_start(random_state=42): # Test if fitting incrementally with warm start gives a forest of the # right size and the same results as a normal fit. X, y = make_hastie_10_2(n_samples=20, random_state=1) clf_ws = None for n_estimators in [5, 10]: if clf_ws is None: clf_ws = EasyEnsembleClassifier( n_estimators=n_estimators, random_state=random_state, warm_start=True, ) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert len(clf_ws) == n_estimators clf_no_ws = EasyEnsembleClassifier( n_estimators=10, random_state=random_state, warm_start=False ) clf_no_ws.fit(X, y) assert {pipe.steps[-1][1].random_state for pipe in clf_ws} == { pipe.steps[-1][1].random_state for pipe in clf_no_ws } def test_warm_start_smaller_n_estimators(): # Test if warm start'ed second fit with smaller n_estimators raises error. 
X, y = make_hastie_10_2(n_samples=20, random_state=1) clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True) clf.fit(X, y) clf.set_params(n_estimators=4) with pytest.raises(ValueError): clf.fit(X, y) def test_warm_start_equal_n_estimators(): # Test that nothing happens when fitting without increasing n_estimators X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf = EasyEnsembleClassifier( n_estimators=5, warm_start=True, random_state=83 ) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) # modify X to nonsense values, this should not change anything X_train += 1.0 warn_msg = "Warm-start fitting without increasing n_estimators" with pytest.warns(UserWarning, match=warn_msg): clf.fit(X_train, y_train) assert_array_equal(y_pred, clf.predict(X_test)) def test_warm_start_equivalence(): # warm started classifier with 5+5 estimators should be equivalent to # one classifier with 10 estimators X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf_ws = EasyEnsembleClassifier( n_estimators=5, warm_start=True, random_state=3141 ) clf_ws.fit(X_train, y_train) clf_ws.set_params(n_estimators=10) clf_ws.fit(X_train, y_train) y1 = clf_ws.predict(X_test) clf = EasyEnsembleClassifier( n_estimators=10, warm_start=False, random_state=3141 ) clf.fit(X_train, y_train) y2 = clf.predict(X_test) assert_allclose(y1, y2) @pytest.mark.parametrize( "n_estimators, msg_error", [ (1.0, "n_estimators must be an integer"), (-10, "n_estimators must be greater than zero"), ], ) def test_easy_ensemble_classifier_error(n_estimators, msg_error): X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) with pytest.raises(ValueError, match=msg_error): eec = EasyEnsembleClassifier(n_estimators=n_estimators) eec.fit(X, y) def test_easy_ensemble_classifier_single_estimator(): X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf1 = EasyEnsembleClassifier(n_estimators=1, random_state=0).fit( X_train, y_train ) clf2 = make_pipeline( RandomUnderSampler(random_state=0), AdaBoostClassifier(random_state=0) ).fit(X_train, y_train) assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) def test_easy_ensemble_classifier_grid_search(): X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) parameters = { "n_estimators": [1, 2], "base_estimator__n_estimators": [3, 4], } grid_search = GridSearchCV( EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()), parameters, cv=5, ) grid_search.fit(X, y) imbalanced-learn-0.7.0/imblearn/ensemble/tests/test_forest.py000066400000000000000000000140541366766276300243660ustar00rootroot00000000000000import pytest import numpy as np from sklearn.datasets import make_classification from sklearn.model_selection import GridSearchCV from sklearn.model_selection import train_test_split from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from imblearn.ensemble import BalancedRandomForestClassifier @pytest.fixture def imbalanced_dataset(): return make_classification( n_samples=10000, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=3, n_clusters_per_class=1, weights=[0.01, 0.05, 0.94], class_sep=0.8, random_state=0, ) 
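# The fixture above yields roughly a 1%/5%/94% class split. A quick,
# hypothetical sanity check of that distribution:
#
#   from collections import Counter
#   X, y = make_classification(n_samples=10000, n_features=2,
#                              n_informative=2, n_redundant=0, n_repeated=0,
#                              n_classes=3, n_clusters_per_class=1,
#                              weights=[0.01, 0.05, 0.94], class_sep=0.8,
#                              random_state=0)
#   Counter(y)  # approximately {2: 9400, 1: 500, 0: 100}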
@pytest.mark.parametrize( "forest_params, err_msg", [ ({"n_estimators": "whatever"}, "n_estimators must be an integer"), ({"n_estimators": -100}, "n_estimators must be greater than zero"), ( {"bootstrap": False, "oob_score": True}, "Out of bag estimation only", ), ], ) def test_balanced_random_forest_error( imbalanced_dataset, forest_params, err_msg ): brf = BalancedRandomForestClassifier(**forest_params) with pytest.raises(ValueError, match=err_msg): brf.fit(*imbalanced_dataset) def test_balanced_random_forest_error_warning_warm_start(imbalanced_dataset): brf = BalancedRandomForestClassifier(n_estimators=5) brf.fit(*imbalanced_dataset) with pytest.raises(ValueError, match="must be larger or equal to"): brf.set_params(warm_start=True, n_estimators=2) brf.fit(*imbalanced_dataset) brf.set_params(n_estimators=10) brf.fit(*imbalanced_dataset) with pytest.warns(UserWarning, match="Warm-start fitting without"): brf.fit(*imbalanced_dataset) def test_balanced_random_forest(imbalanced_dataset): n_estimators = 10 brf = BalancedRandomForestClassifier( n_estimators=n_estimators, random_state=0 ) brf.fit(*imbalanced_dataset) assert len(brf.samplers_) == n_estimators assert len(brf.estimators_) == n_estimators assert len(brf.pipelines_) == n_estimators assert len(brf.feature_importances_) == imbalanced_dataset[0].shape[1] def test_balanced_random_forest_attributes(imbalanced_dataset): X, y = imbalanced_dataset n_estimators = 10 brf = BalancedRandomForestClassifier( n_estimators=n_estimators, random_state=0 ) brf.fit(X, y) for idx in range(n_estimators): X_res, y_res = brf.samplers_[idx].fit_resample(X, y) X_res_2, y_res_2 = ( brf.pipelines_[idx] .named_steps["randomundersampler"] .fit_resample(X, y) ) assert_allclose(X_res, X_res_2) assert_array_equal(y_res, y_res_2) y_pred = brf.estimators_[idx].fit(X_res, y_res).predict(X) y_pred_2 = brf.pipelines_[idx].fit(X, y).predict(X) assert_array_equal(y_pred, y_pred_2) y_pred = brf.estimators_[idx].fit(X_res, y_res).predict_proba(X) y_pred_2 = brf.pipelines_[idx].fit(X, y).predict_proba(X) assert_array_equal(y_pred, y_pred_2) def test_balanced_random_forest_sample_weight(imbalanced_dataset): rng = np.random.RandomState(42) X, y = imbalanced_dataset sample_weight = rng.rand(y.shape[0]) brf = BalancedRandomForestClassifier(n_estimators=5, random_state=0) brf.fit(X, y, sample_weight) @pytest.mark.filterwarnings("ignore:Some inputs do not have OOB scores") def test_balanced_random_forest_oob(imbalanced_dataset): X, y = imbalanced_dataset X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42, stratify=y ) est = BalancedRandomForestClassifier( oob_score=True, random_state=0, n_estimators=1000, min_samples_leaf=2, ) est.fit(X_train, y_train) test_score = est.score(X_test, y_test) assert abs(test_score - est.oob_score_) < 0.1 # Check warning if not enough estimators est = BalancedRandomForestClassifier( oob_score=True, random_state=0, n_estimators=1, bootstrap=True ) with pytest.warns(UserWarning) and np.errstate( divide="ignore", invalid="ignore" ): est.fit(X, y) def test_balanced_random_forest_grid_search(imbalanced_dataset): brf = BalancedRandomForestClassifier() grid = GridSearchCV( brf, {"n_estimators": (1, 2), "max_depth": (1, 2)}, cv=3 ) grid.fit(*imbalanced_dataset) def test_little_tree_with_small_max_samples(): rng = np.random.RandomState(1) X = rng.randn(10000, 2) y = rng.randn(10000) > 0 # First fit with no restriction on max samples est1 = BalancedRandomForestClassifier( n_estimators=1, random_state=rng, max_samples=None, ) # Second 
fit with max samples restricted to just 2
    est2 = BalancedRandomForestClassifier(
        n_estimators=1,
        random_state=rng,
        max_samples=2,
    )
    est1.fit(X, y)
    est2.fit(X, y)
    tree1 = est1.estimators_[0].tree_
    tree2 = est2.estimators_[0].tree_
    msg = "Tree without `max_samples` restriction should have more nodes"
    assert tree1.node_count > tree2.node_count, msg


def test_balanced_random_forest_pruning(imbalanced_dataset):
    brf = BalancedRandomForestClassifier()
    brf.fit(*imbalanced_dataset)
    n_nodes_no_pruning = brf.estimators_[0].tree_.node_count

    brf_pruned = BalancedRandomForestClassifier(ccp_alpha=0.015)
    brf_pruned.fit(*imbalanced_dataset)
    n_nodes_pruning = brf_pruned.estimators_[0].tree_.node_count

    assert n_nodes_no_pruning > n_nodes_pruning


@pytest.mark.parametrize("ratio", [0.5, 0.1])
@pytest.mark.filterwarnings("ignore:Some inputs do not have OOB scores")
def test_balanced_random_forest_oob_binomial(ratio):
    # Regression test for #655: check that the oob score is close to 0.5 in
    # a binomial experiment.
    rng = np.random.RandomState(42)
    n_samples = 1000
    X = np.arange(n_samples).reshape(-1, 1)
    y = rng.binomial(1, ratio, size=n_samples)

    erf = BalancedRandomForestClassifier(oob_score=True, random_state=42)
    erf.fit(X, y)
    assert np.abs(erf.oob_score_ - 0.5) < 0.1
imbalanced-learn-0.7.0/imblearn/ensemble/tests/test_weight_boosting.py000066400000000000000000000066761366766276300262640ustar00rootroot00000000000000import pytest
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.utils._testing import assert_array_equal

from imblearn.ensemble import RUSBoostClassifier


@pytest.fixture
def imbalanced_dataset():
    return make_classification(
        n_samples=10000,
        n_features=3,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=3,
        n_clusters_per_class=1,
        weights=[0.01, 0.05, 0.94],
        class_sep=0.8,
        random_state=0,
    )


@pytest.mark.parametrize(
    "boosting_params, err_msg",
    [
        ({"n_estimators": "whatever"}, "n_estimators must be an integer"),
        ({"n_estimators": -100}, "n_estimators must be greater than zero"),
    ],
)
def test_rusboost_error(imbalanced_dataset, boosting_params, err_msg):
    rusboost = RUSBoostClassifier(**boosting_params)
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)


@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_rusboost(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=1
    )
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(
        n_estimators=n_estimators, algorithm=algorithm, random_state=0
    )
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have different random state
    assert len(
        {sampler.random_state for sampler in rusboost.samplers_}
    ) == len(rusboost.samplers_)
    # each estimator in the ensemble should have different random state
    assert len({est.random_state for est in rusboost.estimators_}) == len(
        rusboost.estimators_
    )

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
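    # RUSBoostClassifier pairs one RandomUnderSampler with every boosting
    # round, which is why ``samplers_``, ``estimators_`` and ``pipelines_``
    # (checked above) grow in lockstep; ``decision_function`` below mirrors
    # the per-class layout of ``predict_proba``.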
assert rusboost.decision_function(X_test).shape[1] == len(classes) score = rusboost.score(X_test, y_test) assert score > 0.7, "Failed with algorithm {} and score {}".format( algorithm, score ) y_pred = rusboost.predict(X_test) assert y_pred.shape == y_test.shape @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) def test_rusboost_sample_weight(imbalanced_dataset, algorithm): X, y = imbalanced_dataset sample_weight = np.ones_like(y) rusboost = RUSBoostClassifier(algorithm=algorithm, random_state=0) # Predictions should be the same when sample_weight are all ones y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X) y_pred_no_sample_weight = rusboost.fit(X, y).predict(X) assert_array_equal(y_pred_sample_weight, y_pred_no_sample_weight) rng = np.random.RandomState(42) sample_weight = rng.rand(y.shape[0]) y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X) with pytest.raises(AssertionError): assert_array_equal(y_pred_no_sample_weight, y_pred_sample_weight) imbalanced-learn-0.7.0/imblearn/exceptions.py000066400000000000000000000006571366766276300212560ustar00rootroot00000000000000""" The :mod:`imblearn.exceptions` module includes all custom warnings and error classes and functions used across imbalanced-learn. """ # Authors: Guillaume Lemaitre # License: MIT def raise_isinstance_error(variable_name, possible_type, variable): raise ValueError( "{} has to be one of {}. Got {} instead.".format( variable_name, possible_type, type(variable) ) ) imbalanced-learn-0.7.0/imblearn/keras/000077500000000000000000000000001366766276300176205ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/keras/__init__.py000066400000000000000000000004001366766276300217230ustar00rootroot00000000000000"""The :mod:`imblearn.keras` provides utilities to deal with imbalanced dataset in keras.""" from ._generator import BalancedBatchGenerator from ._generator import balanced_batch_generator __all__ = ["BalancedBatchGenerator", "balanced_batch_generator"] imbalanced-learn-0.7.0/imblearn/keras/_generator.py000066400000000000000000000240001366766276300223130ustar00rootroot00000000000000"""Implement generators for ``keras`` which will balance the data.""" # This is a trick to avoid an error during tests collection with pytest. We # avoid the error when importing the package raise the error at the moment of # creating the instance. def import_keras(): """Try to import keras from keras and tensorflow. This is possible to import the sequence from keras or tensorflow. Keras is not ducktyping ``Sequence`` before 2.3 and we need import from all possible library to ensure that the ``isinstance(...)`` is not going to fail. This function can be modified when we support Keras 2.3. 
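    Returns
    -------
    ParentClass : tuple
        The class(es) to inherit from: ``keras.utils.Sequence`` and/or
        ``tensorflow.keras.utils.Sequence`` when available, otherwise
        ``(object,)``.
    has_keras : bool
        True when at least one of the two imports succeeded.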
""" def import_from_keras(): try: import keras return (keras.utils.Sequence,), True except ImportError: return tuple(), False def import_from_tensforflow(): try: from tensorflow import keras return (keras.utils.Sequence,), True except ImportError: return tuple(), False ParentClassKeras, has_keras_k = import_from_keras() ParentClassTensorflow, has_keras_tf = import_from_tensforflow() has_keras = has_keras_k or has_keras_tf if has_keras: ParentClass = ParentClassKeras + ParentClassTensorflow else: ParentClass = (object,) return ParentClass, has_keras ParentClass, HAS_KERAS = import_keras() from scipy.sparse import issparse # noqa from sklearn.base import clone # noqa from sklearn.utils import _safe_indexing # noqa from sklearn.utils import check_random_state # noqa from ..under_sampling import RandomUnderSampler # noqa from ..utils import Substitution # noqa from ..utils._docstring import _random_state_docstring # noqa from ..tensorflow import balanced_batch_generator as tf_bbg # noqa from ..utils._validation import _deprecate_positional_args # noqa class BalancedBatchGenerator(*ParentClass): """Create balanced batches when training a keras model. Create a keras ``Sequence`` which is given to ``fit_generator``. The sampler defines the sampling strategy used to balance the dataset ahead of creating the batch. The sampler should have an attribute ``sample_indices_``. Parameters ---------- X : ndarray, shape (n_samples, n_features) Original imbalanced dataset. y : ndarray, shape (n_samples,) or (n_samples, n_classes) Associated targets. sample_weight : ndarray, shape (n_samples,) Sample weight. sampler : object or None, optional (default=RandomUnderSampler) A sampler instance which has an attribute ``sample_indices_``. By default, the sampler used is a :class:`imblearn.under_sampling.RandomUnderSampler`. batch_size : int, optional (default=32) Number of samples per gradient update. keep_sparse : bool, optional (default=False) Either or not to conserve or not the sparsity of the input (i.e. ``X``, ``y``, ``sample_weight``). By default, the returned batches will be dense. random_state : int, RandomState instance or None, optional (default=None) Control the randomization of the algorithm: - If int, ``random_state`` is the seed used by the random number generator; - If ``RandomState`` instance, random_state is the random number generator; - If ``None``, the random number generator is the ``RandomState`` instance used by ``np.random``. Attributes ---------- sampler_ : object The sampler used to balance the dataset. indices_ : ndarray, shape (n_samples, n_features) The indices of the samples selected during sampling. Examples -------- >>> from sklearn.datasets import load_iris >>> iris = load_iris() >>> from imblearn.datasets import make_imbalance >>> class_dict = dict() >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 >>> X, y = make_imbalance(iris.data, iris.target, class_dict) >>> import keras >>> y = keras.utils.to_categorical(y, 3) >>> model = keras.models.Sequential() >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], ... activation='softmax')) >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', ... metrics=['accuracy']) >>> from imblearn.keras import BalancedBatchGenerator >>> from imblearn.under_sampling import NearMiss >>> training_generator = BalancedBatchGenerator( ... X, y, sampler=NearMiss(), batch_size=10, random_state=42) >>> callback_history = model.fit_generator(generator=training_generator, ... 
epochs=10, verbose=0) """ # flag for keras sequence duck-typing use_sequence_api = True @_deprecate_positional_args def __init__( self, X, y, *, sample_weight=None, sampler=None, batch_size=32, keep_sparse=False, random_state=None, ): if not HAS_KERAS: raise ImportError("'No module named 'keras'") self.X = X self.y = y self.sample_weight = sample_weight self.sampler = sampler self.batch_size = batch_size self.keep_sparse = keep_sparse self.random_state = random_state self._sample() def _sample(self): random_state = check_random_state(self.random_state) if self.sampler is None: self.sampler_ = RandomUnderSampler(random_state=random_state) else: self.sampler_ = clone(self.sampler) self.sampler_.fit_resample(self.X, self.y) if not hasattr(self.sampler_, "sample_indices_"): raise ValueError( "'sampler' needs to have an attribute " "'sample_indices_'." ) self.indices_ = self.sampler_.sample_indices_ # shuffle the indices since the sampler are packing them by class random_state.shuffle(self.indices_) def __len__(self): return int(self.indices_.size // self.batch_size) def __getitem__(self, index): X_resampled = _safe_indexing( self.X, self.indices_[ index * self.batch_size:(index + 1) * self.batch_size ], ) y_resampled = _safe_indexing( self.y, self.indices_[ index * self.batch_size:(index + 1) * self.batch_size ], ) if issparse(X_resampled) and not self.keep_sparse: X_resampled = X_resampled.toarray() if self.sample_weight is not None: sample_weight_resampled = _safe_indexing( self.sample_weight, self.indices_[ index * self.batch_size:(index + 1) * self.batch_size ], ) if self.sample_weight is None: return X_resampled, y_resampled else: return X_resampled, y_resampled, sample_weight_resampled @Substitution(random_state=_random_state_docstring) @_deprecate_positional_args def balanced_batch_generator( X, y, *, sample_weight=None, sampler=None, batch_size=32, keep_sparse=False, random_state=None, ): """Create a balanced batch generator to train keras model. Returns a generator --- as well as the number of step per epoch --- which is given to ``fit_generator``. The sampler defines the sampling strategy used to balance the dataset ahead of creating the batch. The sampler should have an attribute ``sample_indices_``. Parameters ---------- X : ndarray, shape (n_samples, n_features) Original imbalanced dataset. y : ndarray, shape (n_samples,) or (n_samples, n_classes) Associated targets. sample_weight : ndarray, shape (n_samples,) Sample weight. sampler : object or None, optional (default=RandomUnderSampler) A sampler instance which has an attribute ``sample_indices_``. By default, the sampler used is a :class:`imblearn.under_sampling.RandomUnderSampler`. batch_size : int, optional (default=32) Number of samples per gradient update. keep_sparse : bool, optional (default=False) Either or not to conserve or not the sparsity of the input (i.e. ``X``, ``y``, ``sample_weight``). By default, the returned batches will be dense. {random_state} Returns ------- generator : generator of tuple Generate batch of data. The tuple generated are either (X_batch, y_batch) or (X_batch, y_batch, sampler_weight_batch). steps_per_epoch : int The number of samples per epoch. Required by ``fit_generator`` in keras. 
Examples -------- >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) >>> from imblearn.datasets import make_imbalance >>> class_dict = dict() >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 >>> from imblearn.datasets import make_imbalance >>> X, y = make_imbalance(X, y, class_dict) >>> import keras >>> y = keras.utils.to_categorical(y, 3) >>> model = keras.models.Sequential() >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], ... activation='softmax')) >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', ... metrics=['accuracy']) >>> from imblearn.keras import balanced_batch_generator >>> from imblearn.under_sampling import NearMiss >>> training_generator, steps_per_epoch = balanced_batch_generator( ... X, y, sampler=NearMiss(), batch_size=10, random_state=42) >>> callback_history = model.fit_generator(generator=training_generator, ... steps_per_epoch=steps_per_epoch, ... epochs=10, verbose=0) """ return tf_bbg( X=X, y=y, sample_weight=sample_weight, sampler=sampler, batch_size=batch_size, keep_sparse=keep_sparse, random_state=random_state, ) imbalanced-learn-0.7.0/imblearn/keras/tests/000077500000000000000000000000001366766276300207625ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/keras/tests/__init__.py000066400000000000000000000000001366766276300230610ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/keras/tests/test_generator.py000066400000000000000000000077001366766276300243650ustar00rootroot00000000000000import pytest import numpy as np from scipy import sparse from sklearn.datasets import load_iris keras = pytest.importorskip("keras") from keras.models import Sequential from keras.layers import Dense from keras.utils import to_categorical from imblearn.datasets import make_imbalance from imblearn.under_sampling import ClusterCentroids from imblearn.under_sampling import NearMiss from imblearn.over_sampling import RandomOverSampler from imblearn.keras import BalancedBatchGenerator from imblearn.keras import balanced_batch_generator @pytest.fixture def data(): iris = load_iris() X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40}) y = to_categorical(y, 3) return X, y def _build_keras_model(n_classes, n_features): model = Sequential() model.add(Dense(n_classes, input_dim=n_features, activation="softmax")) model.compile( optimizer="sgd", loss="categorical_crossentropy", metrics=["accuracy"] ) return model def test_balanced_batch_generator_class_no_return_indices(data): with pytest.raises(ValueError, match="needs to have an attribute"): BalancedBatchGenerator( *data, sampler=ClusterCentroids(), batch_size=10 ) @pytest.mark.filterwarnings("ignore:`wait_time` is not used") # keras 2.2.4 @pytest.mark.parametrize( "sampler, sample_weight", [ (None, None), (RandomOverSampler(), None), (NearMiss(), None), (None, np.random.uniform(size=120)), ], ) def test_balanced_batch_generator_class(data, sampler, sample_weight): X, y = data model = _build_keras_model(y.shape[1], X.shape[1]) training_generator = BalancedBatchGenerator( X, y, sample_weight=sample_weight, sampler=sampler, batch_size=10, random_state=42, ) model.fit_generator(generator=training_generator, epochs=10) @pytest.mark.parametrize("keep_sparse", [True, False]) def test_balanced_batch_generator_class_sparse(data, keep_sparse): X, y = data training_generator = BalancedBatchGenerator( sparse.csr_matrix(X), y, batch_size=10, keep_sparse=keep_sparse, random_state=42, ) for idx in range(len(training_generator)): X_batch, _ = 
training_generator.__getitem__(idx) if keep_sparse: assert sparse.issparse(X_batch) else: assert not sparse.issparse(X_batch) def test_balanced_batch_generator_function_no_return_indices(data): with pytest.raises(ValueError, match="needs to have an attribute"): balanced_batch_generator( *data, sampler=ClusterCentroids(), batch_size=10, random_state=42 ) @pytest.mark.filterwarnings("ignore:`wait_time` is not used") # keras 2.2.4 @pytest.mark.parametrize( "sampler, sample_weight", [ (None, None), (RandomOverSampler(), None), (NearMiss(), None), (None, np.random.uniform(size=120)), ], ) def test_balanced_batch_generator_function(data, sampler, sample_weight): X, y = data model = _build_keras_model(y.shape[1], X.shape[1]) training_generator, steps_per_epoch = balanced_batch_generator( X, y, sample_weight=sample_weight, sampler=sampler, batch_size=10, random_state=42, ) model.fit_generator( generator=training_generator, steps_per_epoch=steps_per_epoch, epochs=10, ) @pytest.mark.parametrize("keep_sparse", [True, False]) def test_balanced_batch_generator_function_sparse(data, keep_sparse): X, y = data training_generator, steps_per_epoch = balanced_batch_generator( sparse.csr_matrix(X), y, keep_sparse=keep_sparse, batch_size=10, random_state=42, ) for _ in range(steps_per_epoch): X_batch, _ = next(training_generator) if keep_sparse: assert sparse.issparse(X_batch) else: assert not sparse.issparse(X_batch) imbalanced-learn-0.7.0/imblearn/metrics/000077500000000000000000000000001366766276300201615ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/metrics/__init__.py000066400000000000000000000012371366766276300222750ustar00rootroot00000000000000""" The :mod:`imblearn.metrics` module includes score functions, performance metrics and pairwise metrics and distance computations. 
""" from ._classification import sensitivity_specificity_support from ._classification import sensitivity_score from ._classification import specificity_score from ._classification import geometric_mean_score from ._classification import make_index_balanced_accuracy from ._classification import classification_report_imbalanced __all__ = [ "sensitivity_specificity_support", "sensitivity_score", "specificity_score", "geometric_mean_score", "make_index_balanced_accuracy", "classification_report_imbalanced", ] imbalanced-learn-0.7.0/imblearn/metrics/_classification.py000066400000000000000000001022001366766276300236600ustar00rootroot00000000000000# coding: utf-8 """Metrics to assess performance on classification task given class prediction Functions named as ``*_score`` return a scalar value to maximize: the higher the better Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: the lower the better """ # Authors: Guillaume Lemaitre # Dariusz Brzezinski # License: MIT import functools import warnings import numpy as np import scipy as sp from sklearn.metrics import precision_recall_fscore_support from sklearn.metrics._classification import _check_targets from sklearn.metrics._classification import _prf_divide from sklearn.preprocessing import LabelEncoder from sklearn.utils.multiclass import unique_labels try: from inspect import signature except ImportError: from sklearn.externals.funcsigs import signature from ..utils._validation import _deprecate_positional_args @_deprecate_positional_args def sensitivity_specificity_support( y_true, y_pred, *, labels=None, pos_label=1, average=None, warn_for=("sensitivity", "specificity"), sample_weight=None, ): """Compute sensitivity, specificity, and support for each class The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The sensitivity quantifies the ability to avoid false negatives_[1]. The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number of true negatives and ``fn`` the number of false negatives. The specificity quantifies the ability to avoid false positives_[1]. The support is the number of occurrences of each class in ``y_true``. If ``pos_label is None`` and in binary classification, this function returns the average sensitivity and specificity if ``average`` is one of ``'weighted'``. Read more in the :ref:`User Guide `. Parameters ---------- y_true : ndarray, shape (n_samples, ) Ground truth (correct) target values. y_pred : ndarray, shape (n_samples, ) Estimated targets as returned by a classifier. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in ``y_true`` and ``y_pred`` are used in sorted order. pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. average : str or None, optional (default=None) If ``None``, the scores for each class are returned. 
Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this function is being used to return only one of its metrics. sample_weight : ndarray, shape (n_samples, ) Sample weights. Returns ------- sensitivity : float (if ``average`` = None) or ndarray, \ shape (n_unique_labels, ) specificity : float (if ``average`` = None) or ndarray, \ shape (n_unique_labels, ) support : int (if ``average`` = None) or ndarray, \ shape (n_unique_labels, ) The number of occurrences of each label in ``y_true``. References ---------- .. [1] `Wikipedia entry for the Sensitivity and specificity `_ Examples -------- >>> import numpy as np >>> from imblearn.metrics import sensitivity_specificity_support >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) >>> sensitivity_specificity_support(y_true, y_pred, average='macro') (0.33333333333333331, 0.66666666666666663, None) >>> sensitivity_specificity_support(y_true, y_pred, average='micro') (0.33333333333333331, 0.66666666666666663, None) >>> sensitivity_specificity_support(y_true, y_pred, average='weighted') (0.33333333333333331, 0.66666666666666663, None) """ average_options = (None, "micro", "macro", "weighted", "samples") if average not in average_options and average != "binary": raise ValueError("average has to be one of " + str(average_options)) y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) if average == "binary": if y_type == "binary": if pos_label not in present_labels: if len(present_labels) < 2: # Only negative labels return (0.0, 0.0, 0) else: raise ValueError( "pos_label=%r is not a valid label: %r" % (pos_label, present_labels) ) labels = [pos_label] else: raise ValueError( "Target is %s but average='binary'. Please " "choose another average setting." % y_type ) elif pos_label not in (None, 1): warnings.warn( "Note that pos_label (set to %r) is ignored when " "average != 'binary' (got %r). You may use " "labels=[pos_label] to specify a single positive class." % (pos_label, average), UserWarning, ) if labels is None: labels = present_labels n_labels = None else: n_labels = len(labels) labels = np.hstack( [labels, np.setdiff1d(present_labels, labels, assume_unique=True)] ) # Calculate tp_sum, pred_sum, true_sum ### if y_type.startswith("multilabel"): raise ValueError("imblearn does not support multilabel") elif average == "samples": raise ValueError( "Sample-based precision, recall, fscore is " "not meaningful outside multilabel " "classification. See the accuracy_score instead." 
) else: le = LabelEncoder() le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) sorted_labels = le.classes_ # labels are now from 0 to len(labels) - 1 -> use bincount tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: tp_bins_weights = np.asarray(sample_weight)[tp] else: tp_bins_weights = None if len(tp_bins): tp_sum = np.bincount( tp_bins, weights=tp_bins_weights, minlength=len(labels) ) else: # Pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) if len(y_pred): pred_sum = np.bincount( y_pred, weights=sample_weight, minlength=len(labels) ) if len(y_true): true_sum = np.bincount( y_true, weights=sample_weight, minlength=len(labels) ) # Compute the true negative tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = tp_sum[indices] true_sum = true_sum[indices] pred_sum = pred_sum[indices] tn_sum = tn_sum[indices] if average == "micro": tp_sum = np.array([tp_sum.sum()]) pred_sum = np.array([pred_sum.sum()]) true_sum = np.array([true_sum.sum()]) tn_sum = np.array([tn_sum.sum()]) # Finally, we have all our sufficient statistics. Divide! # with np.errstate(divide="ignore", invalid="ignore"): # Divide, and on zero-division, set scores to 0 and warn: # Oddly, we may get an "invalid" rather than a "divide" error # here. specificity = _prf_divide( tn_sum, tn_sum + pred_sum - tp_sum, "specificity", "predicted", average, warn_for, ) sensitivity = _prf_divide( tp_sum, true_sum, "sensitivity", "true", average, warn_for ) # Average the results if average == "weighted": weights = true_sum if weights.sum() == 0: return 0, 0, None elif average == "samples": weights = sample_weight else: weights = None if average is not None: assert average != "binary" or len(specificity) == 1 specificity = np.average(specificity, weights=weights) sensitivity = np.average(sensitivity, weights=weights) true_sum = None # return no support return sensitivity, specificity, true_sum @_deprecate_positional_args def sensitivity_score( y_true, y_pred, *, labels=None, pos_label=1, average="binary", sample_weight=None, ): """Compute the sensitivity The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The sensitivity quantifies the ability to avoid false negatives. The best value is 1 and the worst value is 0. Read more in the :ref:`User Guide `. Parameters ---------- y_true : ndarray, shape (n_samples, ) Ground truth (correct) target values. y_pred : ndarray, shape (n_samples, ) Estimated targets as returned by a classifier. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. average : str or None, optional (default=None) If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. 
This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this function is being used to return only one of its metrics. sample_weight : ndarray, shape (n_samples, ) Sample weights. Returns ------- specificity : float (if ``average`` = None) or ndarray, \ shape (n_unique_labels, ) Examples -------- >>> import numpy as np >>> from imblearn.metrics import sensitivity_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> sensitivity_score(y_true, y_pred, average='macro') 0.33333333333333331 >>> sensitivity_score(y_true, y_pred, average='micro') 0.33333333333333331 >>> sensitivity_score(y_true, y_pred, average='weighted') 0.33333333333333331 >>> sensitivity_score(y_true, y_pred, average=None) array([ 1., 0., 0.]) """ s, _, _ = sensitivity_specificity_support( y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=("sensitivity",), sample_weight=sample_weight, ) return s @_deprecate_positional_args def specificity_score( y_true, y_pred, *, labels=None, pos_label=1, average="binary", sample_weight=None, ): """Compute the specificity The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number of true negatives and ``fp`` the number of false positives. The specificity quantifies the ability to avoid false positives. The best value is 1 and the worst value is 0. Read more in the :ref:`User Guide `. Parameters ---------- y_true : ndarray, shape (n_samples, ) Ground truth (correct) target values. y_pred : ndarray, shape (n_samples, ) Estimated targets as returned by a classifier. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. average : str or None, optional (default=None) If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. 
``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this function is being used to return only one of its metrics. sample_weight : ndarray, shape (n_samples, ) Sample weights. Returns ------- specificity : float (if ``average`` = None) or ndarray, \ shape (n_unique_labels, ) Examples -------- >>> import numpy as np >>> from imblearn.metrics import specificity_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> specificity_score(y_true, y_pred, average='macro') 0.66666666666666663 >>> specificity_score(y_true, y_pred, average='micro') 0.66666666666666663 >>> specificity_score(y_true, y_pred, average='weighted') 0.66666666666666663 >>> specificity_score(y_true, y_pred, average=None) array([ 0.75, 0.5 , 0.75]) """ _, s, _ = sensitivity_specificity_support( y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=("specificity",), sample_weight=sample_weight, ) return s @_deprecate_positional_args def geometric_mean_score( y_true, y_pred, *, labels=None, pos_label=1, average="multiclass", sample_weight=None, correction=0.0, ): """Compute the geometric mean. The geometric mean (G-mean) is the root of the product of class-wise sensitivity. This measure tries to maximize the accuracy on each of the classes while keeping these accuracies balanced. For binary classification G-mean is the squared root of the product of the sensitivity and specificity. For multi-class problems it is a higher root of the product of sensitivity for each class. For compatibility with other imbalance performance measures, G-mean can be calculated for each class separately on a one-vs-rest basis when ``average != 'multiclass'``. The best value is 1 and the worst value is 0. Traditionally if at least one class is unrecognized by the classifier, G-mean resolves to zero. To alleviate this property, for highly multi-class the sensitivity of unrecognized classes can be "corrected" to be a user specified value (instead of zero). This option works only if ``average == 'multiclass'``. Read more in the :ref:`User Guide `. Parameters ---------- y_true : ndarray, shape (n_samples, ) Ground truth (correct) target values. y_pred : ndarray, shape (n_samples, ) Estimated targets as returned by a classifier. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. average : str or None, optional (default='multiclass') If ``None``, the scores for each class are returned. 
Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). sample_weight : ndarray, shape (n_samples, ) Sample weights. correction: float, optional (default=0.0) Substitutes sensitivity of unrecognized classes from zero to a given value. Returns ------- geometric_mean : float Notes ----- See :ref:`sphx_glr_auto_examples_evaluation_plot_metrics.py`. References ---------- .. [1] Kubat, M. and Matwin, S. "Addressing the curse of imbalanced training sets: one-sided selection" ICML (1997) .. [2] Barandela, R., Sánchez, J. S., Garcıa, V., & Rangel, E. "Strategies for learning in class imbalance problems", Pattern Recognition, 36(3), (2003), pp 849-851. Examples -------- >>> from imblearn.metrics import geometric_mean_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> geometric_mean_score(y_true, y_pred) 0.0 >>> geometric_mean_score(y_true, y_pred, correction=0.001) 0.010000000000000004 >>> geometric_mean_score(y_true, y_pred, average='macro') 0.47140452079103168 >>> geometric_mean_score(y_true, y_pred, average='micro') 0.47140452079103168 >>> geometric_mean_score(y_true, y_pred, average='weighted') 0.47140452079103168 >>> geometric_mean_score(y_true, y_pred, average=None) array([ 0.8660254, 0. , 0. 
]) """ if average is None or average != "multiclass": sen, spe, _ = sensitivity_specificity_support( y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=("specificity", "specificity"), sample_weight=sample_weight, ) return np.sqrt(sen * spe) else: present_labels = unique_labels(y_true, y_pred) if labels is None: labels = present_labels n_labels = None else: n_labels = len(labels) labels = np.hstack( [ labels, np.setdiff1d(present_labels, labels, assume_unique=True), ] ) le = LabelEncoder() le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) sorted_labels = le.classes_ # labels are now from 0 to len(labels) - 1 -> use bincount tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: tp_bins_weights = np.asarray(sample_weight)[tp] else: tp_bins_weights = None if len(tp_bins): tp_sum = np.bincount( tp_bins, weights=tp_bins_weights, minlength=len(labels) ) else: # Pathological case true_sum = tp_sum = np.zeros(len(labels)) if len(y_true): true_sum = np.bincount( y_true, weights=sample_weight, minlength=len(labels) ) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = tp_sum[indices] true_sum = true_sum[indices] with np.errstate(divide="ignore", invalid="ignore"): recall = _prf_divide( tp_sum, true_sum, "recall", "true", None, "recall" ) recall[recall == 0] = correction with np.errstate(divide="ignore", invalid="ignore"): gmean = sp.stats.gmean(recall) # old version of scipy return MaskedConstant instead of 0.0 if isinstance(gmean, np.ma.core.MaskedConstant): return 0.0 return gmean @_deprecate_positional_args def make_index_balanced_accuracy(*, alpha=0.1, squared=True): """Balance any scoring function using the index balanced accuracy This factory function wraps scoring function to express it as the index balanced accuracy (IBA). You need to use this function to decorate any scoring function. Only metrics requiring ``y_pred`` can be corrected with the index balanced accuracy. ``y_score`` cannot be used since the dominance cannot be computed. Read more in the :ref:`User Guide `. Parameters ---------- alpha : float, optional (default=0.1) Weighting factor. squared : bool, optional (default=True) If ``squared`` is True, then the metric computed will be squared before to be weighted. Returns ------- iba_scoring_func : callable, Returns the scoring metric decorated which will automatically compute the index balanced accuracy. Notes ----- See :ref:`sphx_glr_auto_examples_evaluation_plot_metrics.py`. References ---------- .. [1] García, Vicente, Javier Salvador Sánchez, and Ramón Alberto Mollineda. "On the effectiveness of preprocessing methods when dealing with different levels of class imbalance." Knowledge-Based Systems 25.1 (2012): 13-21. 
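    The decorated function computes ``(1.0 + alpha * dominance) * score``,
    where ``dominance = sensitivity - specificity`` and ``score`` is first
    squared when ``squared=True``, following the IBA definition of [1]_.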
    Examples
    --------
    >>> from imblearn.metrics import geometric_mean_score as gmean
    >>> from imblearn.metrics import make_index_balanced_accuracy as iba
    >>> gmean = iba(alpha=0.1, squared=True)(gmean)
    >>> y_true = [1, 0, 0, 1, 0, 1]
    >>> y_pred = [0, 0, 1, 1, 0, 1]
    >>> print(gmean(y_true, y_pred, average=None))
    [ 0.44444444  0.44444444]
    """

    def decorate(scoring_func):
        @functools.wraps(scoring_func)
        def compute_score(*args, **kwargs):
            signature_scoring_func = signature(scoring_func)
            params_scoring_func = set(signature_scoring_func.parameters.keys())

            # check that the scoring function does not need a score
            # and only a prediction
            prohibited_y_pred = set(["y_score", "y_prob", "y2"])
            if prohibited_y_pred.intersection(params_scoring_func):
                raise AttributeError(
                    "The function {} has an unsupported attribute. Only"
                    " metrics computed from `y_pred` are"
                    " supported.".format(scoring_func.__name__)
                )

            args_scoring_func = signature_scoring_func.bind(*args, **kwargs)
            args_scoring_func.apply_defaults()
            _score = scoring_func(
                *args_scoring_func.args, **args_scoring_func.kwargs
            )
            if squared:
                _score = np.power(_score, 2)

            signature_sens_spec = signature(sensitivity_specificity_support)
            params_sens_spec = set(signature_sens_spec.parameters.keys())
            common_params = params_sens_spec.intersection(
                set(args_scoring_func.arguments.keys())
            )

            args_sens_spec = {
                k: args_scoring_func.arguments[k] for k in common_params
            }

            if scoring_func.__name__ == "geometric_mean_score":
                if "average" in args_sens_spec:
                    if args_sens_spec["average"] == "multiclass":
                        args_sens_spec["average"] = "macro"
            elif (
                scoring_func.__name__ == "accuracy_score"
                or scoring_func.__name__ == "jaccard_score"
            ):
                # We do not support multilabel so the only average supported
                # is binary
                args_sens_spec["average"] = "binary"

            sensitivity, specificity, _ = sensitivity_specificity_support(
                **args_sens_spec
            )

            dominance = sensitivity - specificity
            return (1.0 + alpha * dominance) * _score

        return compute_score

    return decorate


@_deprecate_positional_args
def classification_report_imbalanced(
    y_true,
    y_pred,
    *,
    labels=None,
    target_names=None,
    sample_weight=None,
    digits=2,
    alpha=0.1,
):
    """Build a classification report based on metrics used with imbalanced
    datasets

    Specific metrics have been proposed to evaluate the classification
    performed on an imbalanced dataset. This report compiles the
    state-of-the-art metrics: precision/recall/specificity, geometric mean,
    and index balanced accuracy of the geometric mean.

    Parameters
    ----------
    y_true : ndarray, shape (n_samples, )
        Ground truth (correct) target values.

    y_pred : ndarray, shape (n_samples, )
        Estimated targets as returned by a classifier.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average.

    target_names : list of strings, optional
        Optional display names matching the labels (same order).

    sample_weight : ndarray, shape (n_samples, )
        Sample weights.

    digits : int, optional (default=2)
        Number of digits for formatting output floating point values.

    alpha : float, optional (default=0.1)
        Weighting factor.

    Returns
    -------
    report : string
        Text summary of the precision, recall, specificity, geometric mean,
        and index balanced accuracy.
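    The report columns are ``pre`` (precision), ``rec`` (recall/sensitivity),
    ``spe`` (specificity), ``f1``, ``geo`` (geometric mean), ``iba`` (index
    balanced accuracy) and ``sup`` (support), as illustrated below.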
Examples -------- >>> import numpy as np >>> from imblearn.metrics import classification_report_imbalanced >>> y_true = [0, 1, 2, 2, 2] >>> y_pred = [0, 0, 2, 2, 1] # doctest : +NORMALIZE_WHITESPACE >>> target_names = ['class 0', 'class 1', \ 'class 2'] # doctest : +NORMALIZE_WHITESPACE >>> print(classification_report_imbalanced(y_true, y_pred, \ target_names=target_names)) pre rec spe f1 geo iba\ sup class 0 0.50 1.00 0.75 0.67 0.87 0.77\ 1 class 1 0.00 0.00 0.75 0.00 0.00 0.00\ 1 class 2 1.00 0.67 1.00 0.80 0.82 0.64\ 3 avg / total 0.70 0.60 0.90 0.61 0.66 0.54\ 5 """ if labels is None: labels = unique_labels(y_true, y_pred) else: labels = np.asarray(labels) last_line_heading = "avg / total" if target_names is None: target_names = ["%s" % l for l in labels] name_width = max(len(cn) for cn in target_names) width = max(name_width, len(last_line_heading), digits) headers = ["pre", "rec", "spe", "f1", "geo", "iba", "sup"] fmt = "%% %ds" % width # first column: class name fmt += " " fmt += " ".join(["% 9s" for _ in headers]) fmt += "\n" headers = [""] + headers report = fmt % tuple(headers) report += "\n" # Compute the different metrics # Precision/recall/f1 precision, recall, f1, support = precision_recall_fscore_support( y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight, ) # Specificity specificity = specificity_score( y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight, ) # Geometric mean geo_mean = geometric_mean_score( y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight, ) # Index balanced accuracy iba_gmean = make_index_balanced_accuracy(alpha=alpha, squared=True)( geometric_mean_score ) iba = iba_gmean( y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight, ) for i, label in enumerate(labels): values = [target_names[i]] for v in ( precision[i], recall[i], specificity[i], f1[i], geo_mean[i], iba[i], ): values += ["{0:0.{1}f}".format(v, digits)] values += ["{}".format(support[i])] report += fmt % tuple(values) report += "\n" # compute averages values = [last_line_heading] for v in ( np.average(precision, weights=support), np.average(recall, weights=support), np.average(specificity, weights=support), np.average(f1, weights=support), np.average(geo_mean, weights=support), np.average(iba, weights=support), ): values += ["{0:0.{1}f}".format(v, digits)] values += ["{}".format(np.sum(support))] report += fmt % tuple(values) return report imbalanced-learn-0.7.0/imblearn/metrics/tests/000077500000000000000000000000001366766276300213235ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/metrics/tests/test_classification.py000066400000000000000000000366221366766276300257400ustar00rootroot00000000000000# coding: utf-8 """Testing the metric for classification with imbalanced dataset""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from functools import partial import numpy as np import pytest from sklearn import datasets from sklearn import svm from sklearn.preprocessing import label_binarize from sklearn.utils.fixes import np_version from sklearn.utils.validation import check_random_state from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_no_warnings from sklearn.metrics import accuracy_score, average_precision_score from sklearn.metrics import brier_score_loss, cohen_kappa_score from sklearn.metrics import jaccard_score, precision_score from sklearn.metrics import recall_score, roc_auc_score 
from imblearn.metrics import sensitivity_specificity_support from imblearn.metrics import sensitivity_score from imblearn.metrics import specificity_score from imblearn.metrics import geometric_mean_score from imblearn.metrics import make_index_balanced_accuracy from imblearn.metrics import classification_report_imbalanced from imblearn.utils.testing import warns RND_SEED = 42 R_TOL = 1e-2 ############################################################################### # Utilities for testing def make_prediction(dataset=None, binary=False): """Make some classification predictions on a toy dataset using a SVC If binary is True restrict to a binary classification problem instead of a multiclass classification problem """ if dataset is None: # import some data to play with dataset = datasets.load_iris() X = dataset.data y = dataset.target if binary: # restrict to a binary classification task X, y = X[y < 2], y[y < 2] n_samples, n_features = X.shape p = np.arange(n_samples) rng = check_random_state(37) rng.shuffle(p) X, y = X[p], y[p] half = int(n_samples / 2) # add noisy features to make the problem harder and avoid perfect results rng = np.random.RandomState(0) X = np.c_[X, rng.randn(n_samples, 200 * n_features)] # run classifier, get class probabilities and label predictions clf = svm.SVC(kernel="linear", probability=True, random_state=0) probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) if binary: # only interested in probabilities of the positive case # XXX: do we really want a special API for the binary case? probas_pred = probas_pred[:, 1] y_pred = clf.predict(X[half:]) y_true = y[half:] return y_true, y_pred, probas_pred ############################################################################### # Tests def test_sensitivity_specificity_score_binary(): y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class sen, spe, sup = sensitivity_specificity_support( y_true, y_pred, average=None ) assert_allclose(sen, [0.88, 0.68], rtol=R_TOL) assert_allclose(spe, [0.68, 0.88], rtol=R_TOL) assert_array_equal(sup, [25, 25]) # individual scoring function that can be used for grid search: in the # binary class case the score is the value of the measure for the positive # class (e.g. label == 1). This is deprecated for average != 'binary'. 
for kwargs in ({}, {"average": "binary"}): sen = assert_no_warnings(sensitivity_score, y_true, y_pred, **kwargs) assert sen == pytest.approx(0.68, rel=R_TOL) spe = assert_no_warnings(specificity_score, y_true, y_pred, **kwargs) assert spe == pytest.approx(0.88, rel=R_TOL) @pytest.mark.filterwarnings("ignore:Specificity is ill-defined") @pytest.mark.parametrize( "y_pred, expected_sensitivity, expected_specificity", [(([1, 1], [1, 1]), 1.0, 0.0), (([-1, -1], [-1, -1]), 0.0, 0.0)], ) def test_sensitivity_specificity_f_binary_single_class( y_pred, expected_sensitivity, expected_specificity ): # Such a case may occur with non-stratified cross-validation assert sensitivity_score(*y_pred) == expected_sensitivity assert specificity_score(*y_pred) == expected_specificity @pytest.mark.parametrize( "average, expected_specificity", [ (None, [1.0, 0.67, 1.0, 1.0, 1.0]), ("macro", np.mean([1.0, 0.67, 1.0, 1.0, 1.0])), ("micro", 15 / 16), ], ) def test_sensitivity_specificity_extra_labels(average, expected_specificity): y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] actual = specificity_score( y_true, y_pred, labels=[0, 1, 2, 3, 4], average=average ) assert_allclose(expected_specificity, actual, rtol=R_TOL) def test_sensitivity_specificity_ignored_labels(): y_true = [1, 1, 2, 3] y_pred = [1, 3, 3, 3] specificity_13 = partial(specificity_score, y_true, y_pred, labels=[1, 3]) specificity_all = partial(specificity_score, y_true, y_pred, labels=None) assert_allclose([1.0, 0.33], specificity_13(average=None), rtol=R_TOL) assert_allclose( np.mean([1.0, 0.33]), specificity_13(average="macro"), rtol=R_TOL ) assert_allclose( np.average([1.0, 0.33], weights=[2.0, 1.0]), specificity_13(average="weighted"), rtol=R_TOL, ) assert_allclose( 3.0 / (3.0 + 2.0), specificity_13(average="micro"), rtol=R_TOL ) # ensure the above were meaningful tests: for each in ["macro", "weighted", "micro"]: assert specificity_13(average=each) != specificity_all(average=each) def test_sensitivity_specificity_error_multilabels(): y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] y_true_bin = label_binarize(y_true, classes=np.arange(5)) y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) with pytest.raises(ValueError): sensitivity_score(y_true_bin, y_pred_bin) def test_sensitivity_specificity_support_errors(): y_true, y_pred, _ = make_prediction(binary=True) # Bad pos_label with pytest.raises(ValueError): sensitivity_specificity_support( y_true, y_pred, pos_label=2, average="binary" ) # Bad average option with pytest.raises(ValueError): sensitivity_specificity_support([0, 1, 2], [1, 2, 0], average="mega") def test_sensitivity_specificity_unused_pos_label(): # pos_label should be ignored when average != 'binary', even if the data # is binary with warns(UserWarning, r"use labels=\[pos_label\] to specify a single"): sensitivity_specificity_support( [1, 2, 1], [1, 2, 2], pos_label=2, average="macro" ) def test_geometric_mean_support_binary(): y_true, y_pred, _ = make_prediction(binary=True) # compute the geometric mean for the binary problem geo_mean = geometric_mean_score(y_true, y_pred) assert_allclose(geo_mean, 0.77, rtol=R_TOL) @pytest.mark.filterwarnings("ignore:Recall is ill-defined") @pytest.mark.parametrize( "y_true, y_pred, correction, expected_gmean", [ ([0, 0, 1, 1], [0, 0, 1, 1], 0.0, 1.0), ([0, 0, 0, 0], [1, 1, 1, 1], 0.0, 0.0), ([0, 0, 0, 0], [0, 0, 0, 0], 0.001, 1.0), ([0, 0, 0, 0], [1, 1, 1, 1], 0.001, 0.001), ([0, 0, 1, 1], [0, 1, 1, 0], 0.001, 0.5), ( [0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], 0.001, (0.001 ** 2) ** (1 / 3), ), ([0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5],
0.001, 1), ([0, 1, 1, 1, 1, 0], [0, 0, 1, 1, 1, 1], 0.001, (0.5 * 0.75) ** 0.5), ], ) def test_geometric_mean_multiclass(y_true, y_pred, correction, expected_gmean): gmean = geometric_mean_score(y_true, y_pred, correction=correction) assert gmean == pytest.approx(expected_gmean, rel=R_TOL) @pytest.mark.filterwarnings("ignore:Recall is ill-defined") @pytest.mark.parametrize( "y_true, y_pred, average, expected_gmean", [ ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "macro", 0.471), ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "micro", 0.471), ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "weighted", 0.471), ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], None, [0.8660254, 0.0, 0.0]), ], ) def test_geometric_mean_average(y_true, y_pred, average, expected_gmean): gmean = geometric_mean_score(y_true, y_pred, average=average) assert gmean == pytest.approx(expected_gmean, rel=R_TOL) @pytest.mark.parametrize( "y_true, y_pred, sample_weight, average, expected_gmean", [ ([0, 1, 2, 0, 1, 2], [0, 1, 1, 0, 0, 1], None, "multiclass", 0.707), ( [0, 1, 2, 0, 1, 2], [0, 1, 1, 0, 0, 1], [1, 2, 1, 1, 2, 1], "multiclass", 0.707, ), ( [0, 1, 2, 0, 1, 2], [0, 1, 1, 0, 0, 1], [1, 2, 1, 1, 2, 1], "weighted", 0.333, ), ], ) def test_geometric_mean_sample_weight( y_true, y_pred, sample_weight, average, expected_gmean ): gmean = geometric_mean_score( y_true, y_pred, labels=[0, 1], sample_weight=sample_weight, average=average, ) assert gmean == pytest.approx(expected_gmean, rel=R_TOL) @pytest.mark.parametrize( "average, expected_gmean", [ ("multiclass", 0.41), (None, [0.85, 0.29, 0.7]), ("macro", 0.68), ("weighted", 0.65), ], ) def test_geometric_mean_score_prediction(average, expected_gmean): y_true, y_pred, _ = make_prediction(binary=False) gmean = geometric_mean_score(y_true, y_pred, average=average) assert gmean == pytest.approx(expected_gmean, rel=R_TOL) def test_iba_geo_mean_binary(): y_true, y_pred, _ = make_prediction(binary=True) iba_gmean = make_index_balanced_accuracy(alpha=0.5, squared=True)( geometric_mean_score ) iba = iba_gmean(y_true, y_pred) assert_allclose(iba, 0.5948, rtol=R_TOL) def _format_report(report): return " ".join(report.split()) def test_classification_report_imbalanced_multiclass(): iris = datasets.load_iris() y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) # print classification report with class names expected_report = ( "pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 " "0.81 0.85 0.72 24 versicolor 0.33 0.10 0.86 0.15 " "0.29 0.08 31 virginica 0.42 0.90 0.55 0.57 0.70 " "0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced( y_true, y_pred, labels=np.arange(len(iris.target_names)), target_names=iris.target_names, ) assert _format_report(report) == expected_report # print classification report with label detection expected_report = ( "pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " "0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 " "2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total " "0.51 0.53 0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report def test_classification_report_imbalanced_multiclass_with_digits(): iris = datasets.load_iris() y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) # print classification report with class names expected_report = ( "pre rec spe f1 geo iba sup setosa 0.82609 0.79167 " "0.92157 0.80851 0.85415 0.72010 24 versicolor " "0.33333 0.09677 0.86364 0.15000 0.28910 0.07717 " "31 virginica 0.41860 0.90000 0.54545 0.57143 
0.70065 " "0.50831 20 avg / total 0.51375 0.53333 0.79733 " "0.47310 0.57966 0.39788 75" ) report = classification_report_imbalanced( y_true, y_pred, labels=np.arange(len(iris.target_names)), target_names=iris.target_names, digits=5, ) assert _format_report(report) == expected_report # print classification report with label detection expected_report = ( "pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " "0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 " "2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 " "0.53 0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report def test_classification_report_imbalanced_multiclass_with_string_label(): y_true, y_pred, _ = make_prediction(binary=False) y_true = np.array(["blue", "green", "red"])[y_true] y_pred = np.array(["blue", "green", "red"])[y_pred] expected_report = ( "pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 " "0.85 0.72 24 green 0.33 0.10 0.86 0.15 0.29 0.08 31 " "red 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total " "0.51 0.53 0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report expected_report = ( "pre rec spe f1 geo iba sup a 0.83 0.79 0.92 0.81 0.85 " "0.72 24 b 0.33 0.10 0.86 0.15 0.29 0.08 31 c 0.42 " "0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 0.53 " "0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced( y_true, y_pred, target_names=["a", "b", "c"] ) assert _format_report(report) == expected_report def test_classification_report_imbalanced_multiclass_with_unicode_label(): y_true, y_pred, _ = make_prediction(binary=False) labels = np.array(["blue\xa2", "green\xa2", "red\xa2"]) y_true = labels[y_true] y_pred = labels[y_pred] expected_report = ( "pre rec spe f1 geo iba sup blue¢ 0.83 0.79 0.92 0.81 " "0.85 0.72 24 green¢ 0.33 0.10 0.86 0.15 0.29 0.08 31 " "red¢ 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total " "0.51 0.53 0.80 0.47 0.58 0.40 75" ) if np_version[:3] < (1, 7, 0): with pytest.raises(RuntimeError, match="NumPy < 1.7.0"): classification_report_imbalanced(y_true, y_pred) else: report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report def test_classification_report_imbalanced_multiclass_with_long_string_label(): y_true, y_pred, _ = make_prediction(binary=False) labels = np.array(["blue", "green" * 5, "red"]) y_true = labels[y_true] y_pred = labels[y_pred] expected_report = ( "pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 " "0.85 0.72 24 greengreengreengreengreen 0.33 0.10 " "0.86 0.15 0.29 0.08 31 red 0.42 0.90 0.55 0.57 0.70 " "0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report @pytest.mark.parametrize( "score, expected_score", [ (accuracy_score, 0.54756), (jaccard_score, 0.33176), (precision_score, 0.65025), (recall_score, 0.41616), ], ) def test_iba_sklearn_metrics(score, expected_score): y_true, y_pred, _ = make_prediction(binary=True) score_iba = make_index_balanced_accuracy(alpha=0.5, squared=True)(score) score = score_iba(y_true, y_pred) assert score == pytest.approx(expected_score) @pytest.mark.parametrize( "score_loss", [ average_precision_score, brier_score_loss, cohen_kappa_score, roc_auc_score, ], ) def test_iba_error_y_score_prob_error(score_loss): y_true, y_pred, _ = make_prediction(binary=True) aps = make_index_balanced_accuracy(alpha=0.5, squared=True)(score_loss) with 
pytest.raises(AttributeError): aps(y_true, y_pred) imbalanced-learn-0.7.0/imblearn/metrics/tests/test_score_objects.py000066400000000000000000000045001366766276300255570ustar00rootroot00000000000000"""Test for score""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import pytest from sklearn.datasets import make_blobs from sklearn.metrics import make_scorer from sklearn.svm import LinearSVC from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from imblearn.metrics import sensitivity_score from imblearn.metrics import specificity_score from imblearn.metrics import geometric_mean_score from imblearn.metrics import make_index_balanced_accuracy R_TOL = 1e-2 @pytest.fixture def data(): X, y = make_blobs(random_state=0, centers=2) return train_test_split(X, y, random_state=0) @pytest.mark.filterwarnings("ignore:Liblinear failed to converge") @pytest.mark.parametrize( "score, expected_score", [ (sensitivity_score, 0.92), (specificity_score, 0.92), (geometric_mean_score, 0.92), (make_index_balanced_accuracy()(geometric_mean_score), 0.85), ], ) @pytest.mark.parametrize("average", ["macro", "weighted", "micro"]) def test_scorer_common_average(data, score, expected_score, average): X_train, X_test, y_train, _ = data scorer = make_scorer(score, pos_label=None, average=average) grid = GridSearchCV( LinearSVC(random_state=0), param_grid={"C": [1, 10]}, scoring=scorer, cv=3, ) grid.fit(X_train, y_train).predict(X_test) assert grid.best_score_ == pytest.approx(expected_score, rel=R_TOL) @pytest.mark.filterwarnings("ignore:Liblinear failed to converge") @pytest.mark.parametrize( "score, average, expected_score", [ (sensitivity_score, "binary", 0.92), (specificity_score, "binary", 0.95), (geometric_mean_score, "multiclass", 0.92), ( make_index_balanced_accuracy()(geometric_mean_score), "multiclass", 0.84, ), ], ) def test_scorer_default_average(data, score, average, expected_score): X_train, X_test, y_train, _ = data scorer = make_scorer(score, pos_label=1, average=average) grid = GridSearchCV( LinearSVC(random_state=0), param_grid={"C": [1, 10]}, scoring=scorer, cv=3, ) grid.fit(X_train, y_train).predict(X_test) assert grid.best_score_ == pytest.approx(expected_score, rel=R_TOL) imbalanced-learn-0.7.0/imblearn/over_sampling/000077500000000000000000000000001366766276300213605ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/over_sampling/__init__.py000066400000000000000000000007211366766276300234710ustar00rootroot00000000000000""" The :mod:`imblearn.over_sampling` provides a set of methods to perform over-sampling.
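The available samplers are :class:`ADASYN`, :class:`RandomOverSampler`, :class:`SMOTE`, :class:`BorderlineSMOTE`, :class:`KMeansSMOTE`, :class:`SVMSMOTE` and :class:`SMOTENC`.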
""" from ._adasyn import ADASYN from ._random_over_sampler import RandomOverSampler from ._smote import SMOTE from ._smote import BorderlineSMOTE from ._smote import KMeansSMOTE from ._smote import SVMSMOTE from ._smote import SMOTENC __all__ = [ "ADASYN", "RandomOverSampler", "KMeansSMOTE", "SMOTE", "BorderlineSMOTE", "SVMSMOTE", "SMOTENC", ] imbalanced-learn-0.7.0/imblearn/over_sampling/_adasyn.py000066400000000000000000000143021366766276300233500ustar00rootroot00000000000000"""Class to perform over-sampling using ADASYN.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np from scipy import sparse from sklearn.utils import check_random_state from sklearn.utils import _safe_indexing from .base import BaseOverSampler from ..utils import check_neighbors_object from ..utils import Substitution from ..utils._docstring import _n_jobs_docstring from ..utils._docstring import _random_state_docstring from ..utils._validation import _deprecate_positional_args @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class ADASYN(BaseOverSampler): """Oversample using Adaptive Synthetic (ADASYN) algorithm. This method is similar to SMOTE but it generates different number of samples depending on an estimate of the local distribution of the class to be oversampled. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} n_neighbors : int int or object, optional (default=5) If ``int``, number of nearest neighbours to used to construct synthetic samples. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the k_neighbors. {n_jobs} See Also -------- SMOTE : Over-sample using SMOTE. Notes ----- The implementation is based on [1]_. Supports multi-class resampling. A one-vs.-rest scheme is used. References ---------- .. [1] He, Haibo, Yang Bai, Edwardo A. Garcia, and Shutao Li. "ADASYN: Adaptive synthetic sampling approach for imbalanced learning," In IEEE International Joint Conference on Neural Networks (IEEE World Congress on Computational Intelligence), pp. 1322-1328, 2008. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ ADASYN # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, ... 
random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> ada = ADASYN(random_state=42) >>> X_res, y_res = ada.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 904, 1: 900}}) """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", random_state=None, n_neighbors=5, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.n_neighbors = n_neighbors self.n_jobs = n_jobs def _validate_estimator(self): """Create the necessary objects for ADASYN""" self.nn_ = check_neighbors_object( "n_neighbors", self.n_neighbors, additional_neighbor=1 ) self.nn_.set_params(**{"n_jobs": self.n_jobs}) def _fit_resample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) X_resampled = [X.copy()] y_resampled = [y.copy()] for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) X_class = _safe_indexing(X, target_class_indices) self.nn_.fit(X) nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] # The ratio is computed in a one-vs-rest manner. Using majority # in multi-class would lead to slightly different results at the # cost of introducing a new parameter. n_neighbors = self.nn_.n_neighbors - 1 ratio_nn = np.sum(y[nns] != class_sample, axis=1) / n_neighbors if not np.sum(ratio_nn): raise RuntimeError( "No neighbours belong to the majority" " class. This case will induce a NaN case" " with a division by zero. ADASYN is not" " suited for this specific dataset." " Use SMOTE instead." ) ratio_nn /= np.sum(ratio_nn) n_samples_generate = np.rint(ratio_nn * n_samples).astype(int) # rounding may change the effective number of samples n_samples = np.sum(n_samples_generate) if not n_samples: raise ValueError( "No samples will be generated with the" " provided ratio settings."
) # the nearest neighbors need to be fitted only on the current class # to find the class NN to generate new samples self.nn_.fit(X_class) nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] enumerated_class_indices = np.arange(len(target_class_indices)) rows = np.repeat(enumerated_class_indices, n_samples_generate) cols = random_state.choice(n_neighbors, size=n_samples) diffs = X_class[nns[rows, cols]] - X_class[rows] steps = random_state.uniform(size=(n_samples, 1)) if sparse.issparse(X): sparse_func = type(X).__name__ steps = getattr(sparse, sparse_func)(steps) X_new = X_class[rows] + steps.multiply(diffs) else: X_new = X_class[rows] + steps * diffs X_new = X_new.astype(X.dtype) y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype) X_resampled.append(X_new) y_resampled.append(y_new) if sparse.issparse(X): X_resampled = sparse.vstack(X_resampled, format=X.format) else: X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) return X_resampled, y_resampled imbalanced-learn-0.7.0/imblearn/over_sampling/_random_over_sampler.py000066400000000000000000000067451366766276300261430ustar00rootroot00000000000000"""Class to perform random over-sampling.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter import numpy as np from sklearn.utils import check_random_state from sklearn.utils import _safe_indexing from .base import BaseOverSampler from ..utils import check_target_type from ..utils import Substitution from ..utils._docstring import _random_state_docstring from ..utils._validation import _deprecate_positional_args @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, random_state=_random_state_docstring, ) class RandomOverSampler(BaseOverSampler): """Class to perform random over-sampling. Object to over-sample the minority class(es) by picking samples at random with replacement. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} Attributes ---------- sample_indices_ : ndarray of shape (n_new_samples) Indices of the samples selected. .. versionadded:: 0.4 See Also -------- SMOTE : Oversample by generating synthetic samples. Notes ----- Supports multi-class resampling by sampling each class independently. Supports heterogeneous data as object array containing string and numeric data. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ RandomOverSampler # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> ros = RandomOverSampler(random_state=42) >>> X_res, y_res = ros.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) """ @_deprecate_positional_args def __init__(self, *, sampling_strategy="auto", random_state=None): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state def _check_X_y(self, X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = self._validate_data( X, y, reset=True, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False, ) return X, y, binarize_y def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) target_stats = Counter(y) sample_indices = range(X.shape[0]) for class_sample, num_samples in self.sampling_strategy_.items(): target_class_indices = np.flatnonzero(y == class_sample) indices = random_state.randint( low=0, high=target_stats[class_sample], size=num_samples ) sample_indices = np.append( sample_indices, target_class_indices[indices] ) self.sample_indices_ = np.array(sample_indices) return ( _safe_indexing(X, sample_indices), _safe_indexing(y, sample_indices), ) def _more_tags(self): return { "X_types": ["2darray", "string"], "sample_indices": True, "allow_nan": True, } imbalanced-learn-0.7.0/imblearn/over_sampling/_smote.py000066400000000000000000001367071366766276300232360ustar00rootroot00000000000000"""Class to perform over-sampling using SMOTE.""" # Authors: Guillaume Lemaitre # Fernando Nogueira # Christos Aridas # Dzianis Dudnik # License: MIT import math from collections import Counter import numpy as np from scipy import sparse from sklearn.base import clone from sklearn.cluster import MiniBatchKMeans from sklearn.metrics import pairwise_distances from sklearn.preprocessing import OneHotEncoder from sklearn.svm import SVC from sklearn.utils import check_random_state from sklearn.utils import _safe_indexing from sklearn.utils import check_array from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0 from sklearn.utils.sparsefuncs_fast import csc_mean_variance_axis0 from .base import BaseOverSampler from ..exceptions import raise_isinstance_error from ..utils import check_neighbors_object from ..utils import check_target_type from ..utils import Substitution from ..utils._docstring import _n_jobs_docstring from ..utils._docstring import _random_state_docstring from ..utils._validation import _deprecate_positional_args class BaseSMOTE(BaseOverSampler): """Base class for the different SMOTE algorithms.""" def __init__( self, sampling_strategy="auto", random_state=None, k_neighbors=5, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.k_neighbors = k_neighbors self.n_jobs = n_jobs def _validate_estimator(self): """Check the NN estimators shared across the different SMOTE algorithms. """ self.nn_k_ = check_neighbors_object( "k_neighbors", self.k_neighbors, additional_neighbor=1 ) def _make_samples( self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0 ): """A support function that returns artificial samples constructed along the line connecting nearest neighbours. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Points from which the points will be created. y_dtype : dtype The data type of the targets. 
y_type : str or int The minority target value, just so the function can return the target values for the synthetic samples with the correct length. nn_data : ndarray of shape (n_samples_all, n_features) Data set carrying all the neighbours to be used. nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours) The nearest neighbours of each sample in `nn_data`. n_samples : int The number of samples to generate. step_size : float, default=1.0 The step size to create samples. Returns ------- X_new : {ndarray, sparse matrix} of shape (n_samples_new, n_features) Synthetically generated samples. y_new : ndarray of shape (n_samples_new,) Target values for synthetic samples. """ random_state = check_random_state(self.random_state) samples_indices = random_state.randint( low=0, high=nn_num.size, size=n_samples ) # np.newaxis for backwards compatibility with random_state steps = step_size * random_state.uniform(size=n_samples)[:, np.newaxis] rows = np.floor_divide(samples_indices, nn_num.shape[1]) cols = np.mod(samples_indices, nn_num.shape[1]) X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps) y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype) return X_new, y_new def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps): r"""Generate a synthetic sample. The rule for the generation is: .. math:: \mathbf{s_{s}} = \mathbf{s_{i}} + \mathcal{u}(0, 1) \times (\mathbf{s_{nn}} - \mathbf{s_{i}}) \, where \mathbf{s_{s}} is the new synthetic sample, \mathbf{s_{i}} is the current sample, \mathbf{s_{nn}} is a randomly selected neighbor of \mathbf{s_{i}} and \mathcal{u}(0, 1) is a random number between [0, 1). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Points from which the points will be created. nn_data : ndarray of shape (n_samples_all, n_features) Data set carrying all the neighbours to be used. nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours) The nearest neighbours of each sample in `nn_data`. rows : ndarray of shape (n_samples,), dtype=int Indices pointing at feature vector in X which will be used as a base for creating new samples. cols : ndarray of shape (n_samples,), dtype=int Indices pointing at which nearest neighbor of base feature vector will be used when creating new samples. steps : ndarray of shape (n_samples,), dtype=float Step sizes for new samples. Returns ------- X_new : {ndarray, sparse matrix} of shape (n_samples, n_features) Synthetically generated samples. """ diffs = nn_data[nn_num[rows, cols]] - X[rows] if sparse.issparse(X): sparse_func = type(X).__name__ steps = getattr(sparse, sparse_func)(steps) X_new = X[rows] + steps.multiply(diffs) else: X_new = X[rows] + steps * diffs return X_new.astype(X.dtype) def _in_danger_noise( self, nn_estimator, samples, target_class, y, kind="danger" ): """Estimate if a set of samples are in danger or noise. Used by BorderlineSMOTE and SVMSMOTE. Parameters ---------- nn_estimator : estimator An estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` used to determine whether a sample is in danger/noise. samples : {array-like, sparse matrix} of shape (n_samples, n_features) The samples to check whether they are in danger or not. target_class : int or str The corresponding target class being over-sampled. y : array-like of shape (n_samples,) The true label in order to check the neighbour labels. kind : {'danger', 'noise'}, default='danger' The type of classification to use.
Can be either: - If 'danger', check if samples are in danger, - If 'noise', check if samples are noise. Returns ------- output : ndarray of shape (n_samples,) A boolean array where True refers to samples in danger or noise. """ x = nn_estimator.kneighbors(samples, return_distance=False)[:, 1:] nn_label = (y[x] != target_class).astype(int) n_maj = np.sum(nn_label, axis=1) if kind == "danger": # Samples are in danger for m/2 <= m' < m return np.bitwise_and( n_maj >= (nn_estimator.n_neighbors - 1) / 2, n_maj < nn_estimator.n_neighbors - 1, ) elif kind == "noise": # Samples are noise for m = m' return n_maj == nn_estimator.n_neighbors - 1 else: raise NotImplementedError @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class BorderlineSMOTE(BaseSMOTE): """Over-sampling using Borderline SMOTE. This algorithm is a variant of the original SMOTE algorithm proposed in [2]_. Borderline samples will be detected and used to generate new synthetic samples. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} k_neighbors : int or object, default=5 If ``int``, number of nearest neighbours used to construct synthetic samples. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the k_neighbors. {n_jobs} m_neighbors : int or object, default=10 If int, number of nearest neighbours to use to determine if a minority sample is in danger. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the m_neighbors. kind : {{"borderline-1", "borderline-2"}}, default='borderline-1' The type of SMOTE algorithm to use, one of the following options: ``'borderline-1'``, ``'borderline-2'``. See Also -------- SMOTE : Over-sample using SMOTE. SMOTENC : Over-sample using SMOTE for continuous and categorical features. SVMSMOTE : Over-sample using SVM-SMOTE variant. ADASYN : Over-sample using ADASYN. KMeansSMOTE : Over-sample applying a clustering before oversampling using SMOTE. Notes ----- See the original paper [2]_ for more details. Supports multi-class resampling. A one-vs.-rest scheme is used as originally proposed in [1]_. References ---------- .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: synthetic minority over-sampling technique," Journal of artificial intelligence research, 321-357, 2002. .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new over-sampling method in imbalanced data sets learning," Advances in intelligent computing, 878-887, 2005. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ BorderlineSMOTE # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ...
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> sm = BorderlineSMOTE(random_state=42) >>> X_res, y_res = sm.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", random_state=None, k_neighbors=5, n_jobs=None, m_neighbors=10, kind="borderline-1", ): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, n_jobs=n_jobs, ) self.m_neighbors = m_neighbors self.kind = kind def _validate_estimator(self): super()._validate_estimator() self.nn_m_ = check_neighbors_object( "m_neighbors", self.m_neighbors, additional_neighbor=1 ) self.nn_m_.set_params(**{"n_jobs": self.n_jobs}) if self.kind not in ("borderline-1", "borderline-2"): raise ValueError( 'The possible "kind" of algorithm are ' '"borderline-1" and "borderline-2".' " Got {} instead.".format(self.kind) ) def _fit_resample(self, X, y): self._validate_estimator() X_resampled = X.copy() y_resampled = y.copy() for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) X_class = _safe_indexing(X, target_class_indices) self.nn_m_.fit(X) danger_index = self._in_danger_noise( self.nn_m_, X_class, class_sample, y, kind="danger" ) if not any(danger_index): continue self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors( _safe_indexing(X_class, danger_index), return_distance=False )[:, 1:] # divergence between borderline-1 and borderline-2 if self.kind == "borderline-1": # Create synthetic samples for borderline points. X_new, y_new = self._make_samples( _safe_indexing(X_class, danger_index), y.dtype, class_sample, X_class, nns, n_samples, ) if sparse.issparse(X_new): X_resampled = sparse.vstack([X_resampled, X_new]) else: X_resampled = np.vstack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) elif self.kind == "borderline-2": random_state = check_random_state(self.random_state) fractions = random_state.beta(10, 10) # only minority X_new_1, y_new_1 = self._make_samples( _safe_indexing(X_class, danger_index), y.dtype, class_sample, X_class, nns, int(fractions * (n_samples + 1)), step_size=1.0, ) # we use a one-vs-rest policy to handle the multiclass in which # new samples will be created considering not only the majority # class but all other classes. X_new_2, y_new_2 = self._make_samples( _safe_indexing(X_class, danger_index), y.dtype, class_sample, _safe_indexing(X, np.flatnonzero(y != class_sample)), nns, int((1 - fractions) * n_samples), step_size=0.5, ) if sparse.issparse(X_resampled): X_resampled = sparse.vstack( [X_resampled, X_new_1, X_new_2] ) else: X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) y_resampled = np.hstack((y_resampled, y_new_1, y_new_2)) return X_resampled, y_resampled @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class SVMSMOTE(BaseSMOTE): """Over-sampling using SVM-SMOTE. Variant of the SMOTE algorithm which uses an SVM algorithm to detect samples to use for generating new synthetic samples, as proposed in [2]_. Read more in the :ref:`User Guide `.
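Samples detected as being in danger are used to generate new samples by interpolation towards neighbouring minority samples, while safe support vectors are used to generate new samples by extrapolation away from their neighbours, with the step magnitude controlled by ``out_step``.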
Parameters ---------- {sampling_strategy} {random_state} k_neighbors : int or object, default=5 If ``int``, number of nearest neighbours used to construct synthetic samples. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the k_neighbors. {n_jobs} m_neighbors : int or object, default=10 If int, number of nearest neighbours to use to determine if a minority sample is in danger. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the m_neighbors. svm_estimator : object, default=SVC() A parametrized :class:`sklearn.svm.SVC` classifier can be passed. out_step : float, default=0.5 Step size when extrapolating. See Also -------- SMOTE : Over-sample using SMOTE. SMOTENC : Over-sample using SMOTE for continuous and categorical features. BorderlineSMOTE : Over-sample using Borderline-SMOTE. ADASYN : Over-sample using ADASYN. KMeansSMOTE : Over-sample applying a clustering before oversampling using SMOTE. Notes ----- See the original paper [2]_ for more details. Supports multi-class resampling. A one-vs.-rest scheme is used as originally proposed in [1]_. References ---------- .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: synthetic minority over-sampling technique," Journal of artificial intelligence research, 321-357, 2002. .. [2] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling for imbalanced data classification," International Journal of Knowledge Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2009. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ SVMSMOTE # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ...
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> sm = SVMSMOTE(random_state=42) >>> X_res, y_res = sm.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", random_state=None, k_neighbors=5, n_jobs=None, m_neighbors=10, svm_estimator=None, out_step=0.5, ): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, n_jobs=n_jobs, ) self.m_neighbors = m_neighbors self.svm_estimator = svm_estimator self.out_step = out_step def _validate_estimator(self): super()._validate_estimator() self.nn_m_ = check_neighbors_object( "m_neighbors", self.m_neighbors, additional_neighbor=1 ) self.nn_m_.set_params(**{"n_jobs": self.n_jobs}) if self.svm_estimator is None: self.svm_estimator_ = SVC( gamma="scale", random_state=self.random_state ) elif isinstance(self.svm_estimator, SVC): self.svm_estimator_ = clone(self.svm_estimator) else: raise_isinstance_error("svm_estimator", [SVC], self.svm_estimator) def _fit_resample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) X_resampled = X.copy() y_resampled = y.copy() for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) X_class = _safe_indexing(X, target_class_indices) self.svm_estimator_.fit(X, y) support_index = self.svm_estimator_.support_[ y[self.svm_estimator_.support_] == class_sample ] support_vector = _safe_indexing(X, support_index) self.nn_m_.fit(X) noise_bool = self._in_danger_noise( self.nn_m_, support_vector, class_sample, y, kind="noise" ) support_vector = _safe_indexing( support_vector, np.flatnonzero(np.logical_not(noise_bool)) ) danger_bool = self._in_danger_noise( self.nn_m_, support_vector, class_sample, y, kind="danger" ) safety_bool = np.logical_not(danger_bool) self.nn_k_.fit(X_class) fractions = random_state.beta(10, 10) n_generated_samples = int(fractions * (n_samples + 1)) if np.count_nonzero(danger_bool) > 0: nns = self.nn_k_.kneighbors( _safe_indexing(support_vector, np.flatnonzero(danger_bool)), return_distance=False, )[:, 1:] X_new_1, y_new_1 = self._make_samples( _safe_indexing(support_vector, np.flatnonzero(danger_bool)), y.dtype, class_sample, X_class, nns, n_generated_samples, step_size=1.0, ) if np.count_nonzero(safety_bool) > 0: nns = self.nn_k_.kneighbors( _safe_indexing(support_vector, np.flatnonzero(safety_bool)), return_distance=False, )[:, 1:] X_new_2, y_new_2 = self._make_samples( _safe_indexing(support_vector, np.flatnonzero(safety_bool)), y.dtype, class_sample, X_class, nns, n_samples - n_generated_samples, step_size=-self.out_step, ) if ( np.count_nonzero(danger_bool) > 0 and np.count_nonzero(safety_bool) > 0 ): if sparse.issparse(X_resampled): X_resampled = sparse.vstack( [X_resampled, X_new_1, X_new_2] ) else: X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) y_resampled = np.concatenate( (y_resampled, y_new_1, y_new_2), axis=0 ) elif np.count_nonzero(danger_bool) == 0: if sparse.issparse(X_resampled): X_resampled = sparse.vstack([X_resampled, X_new_2]) else: X_resampled = np.vstack((X_resampled, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_2), axis=0) elif np.count_nonzero(safety_bool) == 0: if sparse.issparse(X_resampled): 
X_resampled = sparse.vstack([X_resampled, X_new_1]) else: X_resampled = np.vstack((X_resampled, X_new_1)) y_resampled = np.concatenate((y_resampled, y_new_1), axis=0) return X_resampled, y_resampled @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class SMOTE(BaseSMOTE): """Class to perform over-sampling using SMOTE. This object is an implementation of SMOTE - Synthetic Minority Over-sampling Technique as presented in [1]_. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} k_neighbors : int or object, default=5 If ``int``, number of nearest neighbours used to construct synthetic samples. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the k_neighbors. {n_jobs} See Also -------- SMOTENC : Over-sample using SMOTE for continuous and categorical features. BorderlineSMOTE : Over-sample using the borderline-SMOTE variant. SVMSMOTE : Over-sample using the SVM-SMOTE variant. ADASYN : Over-sample using ADASYN. KMeansSMOTE : Over-sample applying a clustering before oversampling using SMOTE. Notes ----- See the original paper [1]_ for more details. Supports multi-class resampling. A one-vs.-rest scheme is used as originally proposed in [1]_. References ---------- .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: synthetic minority over-sampling technique," Journal of artificial intelligence research, 321-357, 2002. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ SMOTE # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> sm = SMOTE(random_state=42) >>> X_res, y_res = sm.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", random_state=None, k_neighbors=5, n_jobs=None, ): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, n_jobs=n_jobs, ) def _fit_resample(self, X, y): self._validate_estimator() X_resampled = [X.copy()] y_resampled = [y.copy()] for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) X_class = _safe_indexing(X, target_class_indices) self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:] X_new, y_new = self._make_samples( X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0 ) X_resampled.append(X_new) y_resampled.append(y_new) if sparse.issparse(X): X_resampled = sparse.vstack(X_resampled, format=X.format) else: X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) return X_resampled, y_resampled # @Substitution( # sampling_strategy=BaseOverSampler._sampling_strategy_docstring, # random_state=_random_state_docstring) class SMOTENC(SMOTE): """Synthetic Minority Over-sampling Technique for Nominal and Continuous. Unlike :class:`SMOTE`, SMOTE-NC is designed for datasets containing both continuous and categorical features. However, it is not designed to work with only categorical features. Read more in the :ref:`User Guide `. Parameters ---------- categorical_features : ndarray of shape (n_cat_features,) or (n_features,) Specifies which features are categorical. Can either be: - array of indices specifying the categorical features; - mask array of shape (n_features, ) and ``bool`` dtype for which ``True`` indicates the categorical features. sampling_strategy : float, str, dict or callable, default='auto' Sampling information to resample the data set. - When ``float``, it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling. Therefore, the ratio is expressed as :math:`\\alpha_{os} = N_{rm} / N_{M}` where :math:`N_{rm}` is the number of samples in the minority class after resampling and :math:`N_{M}` is the number of samples in the majority class. .. warning:: ``float`` is only available for **binary** classification. An error is raised for multi-class classification. - When ``str``, specify the class targeted by the resampling. The number of samples in the different classes will be equalized. Possible choices are: ``'minority'``: resample only the minority class; ``'not minority'``: resample all classes but the minority class; ``'not majority'``: resample all classes but the majority class; ``'all'``: resample all classes; ``'auto'``: equivalent to ``'not majority'``. - When ``dict``, the keys correspond to the targeted classes. The values correspond to the desired number of samples for each targeted class. - When callable, a function taking ``y`` and returning a ``dict``. The keys correspond to the targeted classes. The values correspond to the desired number of samples for each class. random_state : int, RandomState instance, default=None Control the randomization of the algorithm. - If int, ``random_state`` is the seed used by the random number generator; - If ``RandomState`` instance, random_state is the random number generator; - If ``None``, the random number generator is the ``RandomState`` instance used by ``np.random``. k_neighbors : int or object, default=5 If ``int``, number of nearest neighbours used to construct synthetic samples. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the k_neighbors. n_jobs : int, default=None Number of CPU cores used during the cross-validation loop. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See `Glossary `_ for more details. See Also -------- SMOTE : Over-sample using SMOTE. SVMSMOTE : Over-sample using SVM-SMOTE variant. BorderlineSMOTE : Over-sample using Borderline-SMOTE variant. ADASYN : Over-sample using ADASYN. KMeansSMOTE : Over-sample applying a clustering before oversampling using SMOTE. Notes ----- See the original paper [1]_ for more details. Supports multi-class resampling. A one-vs.-rest scheme is used as originally proposed in [1]_. See :ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`, and :ref:`sphx_glr_auto_examples_over-sampling_plot_illustration_generation_sample.py`. References ---------- .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: synthetic minority over-sampling technique," Journal of artificial intelligence research, 321-357, 2002.
Examples -------- >>> from collections import Counter >>> from numpy.random import RandomState >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import SMOTENC >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape (%s, %s)' % X.shape) Original dataset shape (1000, 20) >>> print('Original dataset samples per class {}'.format(Counter(y))) Original dataset samples per class Counter({1: 900, 0: 100}) >>> # simulate the 2 last columns to be categorical features >>> X[:, -2:] = RandomState(10).randint(0, 4, size=(1000, 2)) >>> sm = SMOTENC(random_state=42, categorical_features=[18, 19]) >>> X_res, y_res = sm.fit_resample(X, y) >>> print('Resampled dataset samples per class {}'.format(Counter(y_res))) Resampled dataset samples per class Counter({0: 900, 1: 900}) """ _required_parameters = ["categorical_features"] @_deprecate_positional_args def __init__( self, categorical_features, *, sampling_strategy="auto", random_state=None, k_neighbors=5, n_jobs=None, ): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, n_jobs=n_jobs, ) self.categorical_features = categorical_features def _check_X_y(self, X, y): """Overwrite the checking to let strings pass for categorical features. """ y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = self._validate_data( X, y, reset=True, dtype=None, accept_sparse=["csr", "csc"] ) return X, y, binarize_y def _validate_estimator(self): super()._validate_estimator() categorical_features = np.asarray(self.categorical_features) if categorical_features.dtype.name == "bool": self.categorical_features_ = np.flatnonzero(categorical_features) else: if any( [ cat not in np.arange(self.n_features_) for cat in categorical_features ] ): raise ValueError( "Some of the categorical indices are out of range. Indices" " should be between 0 and {}".format(self.n_features_) ) self.categorical_features_ = categorical_features self.continuous_features_ = np.setdiff1d( np.arange(self.n_features_), self.categorical_features_ ) if self.categorical_features_.size == self.n_features_in_: raise ValueError( "SMOTE-NC is not designed to work only with categorical " "features. It requires some numerical features."
) def _fit_resample(self, X, y): self.n_features_ = X.shape[1] self._validate_estimator() # compute the median of the standard deviation of the minority class target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) X_continuous = X[:, self.continuous_features_] X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"]) X_minority = _safe_indexing( X_continuous, np.flatnonzero(y == class_minority) ) if sparse.issparse(X): if X.format == "csr": _, var = csr_mean_variance_axis0(X_minority) else: _, var = csc_mean_variance_axis0(X_minority) else: var = X_minority.var(axis=0) self.median_std_ = np.median(np.sqrt(var)) X_categorical = X[:, self.categorical_features_] if X_continuous.dtype.name != "object": dtype_ohe = X_continuous.dtype else: dtype_ohe = np.float64 self.ohe_ = OneHotEncoder( sparse=True, handle_unknown="ignore", dtype=dtype_ohe ) # the input of the OneHotEncoder needs to be dense X_ohe = self.ohe_.fit_transform( X_categorical.toarray() if sparse.issparse(X_categorical) else X_categorical ) # we can replace the 1 entries of the categorical features with the # median of the standard deviation. It will ensure that whenever # distance is computed between 2 samples, the difference will be equal # to the median of the standard deviation as in the original paper. # In the edge case where the median of the std is equal to 0, the 1s # entries will also be nullified. In this case, we store the original # categorical encoding which will be later used for inverting the OHE if math.isclose(self.median_std_, 0): self._X_categorical_minority_encoded = _safe_indexing( X_ohe.toarray(), np.flatnonzero(y == class_minority) ) X_ohe.data = ( np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2 ) X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr") X_resampled, y_resampled = super()._fit_resample(X_encoded, y) # reverse the encoding of the categorical features X_res_cat = X_resampled[:, self.continuous_features_.size:] X_res_cat.data = np.ones_like(X_res_cat.data) X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat) if sparse.issparse(X): X_resampled = sparse.hstack( ( X_resampled[:, : self.continuous_features_.size], X_res_cat_dec, ), format="csr", ) else: X_resampled = np.hstack( ( X_resampled[:, : self.continuous_features_.size].toarray(), X_res_cat_dec, ) ) indices_reordered = np.argsort( np.hstack((self.continuous_features_, self.categorical_features_)) ) if sparse.issparse(X_resampled): # the matrix is supposed to be in the CSR format after the stacking col_indices = X_resampled.indices.copy() for idx, col_idx in enumerate(indices_reordered): mask = X_resampled.indices == col_idx col_indices[mask] = idx X_resampled.indices = col_indices else: X_resampled = X_resampled[:, indices_reordered] return X_resampled, y_resampled def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps): """Generate a synthetic sample with an additional step for the categorical features. Each new sample is generated the same way as in SMOTE. However, the categorical features are mapped to the most frequent nearest neighbors of the majority class.
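For each categorical feature, the category is set to the most frequent category among the nearest neighbours; ties between equally frequent categories are broken at random.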
""" rng = check_random_state(self.random_state) X_new = super()._generate_samples( X, nn_data, nn_num, rows, cols, steps ) # change in sparsity structure more efficient with LIL than CSR X_new = (X_new.tolil() if sparse.issparse(X_new) else X_new) # convert to dense array since scipy.sparse doesn't handle 3D nn_data = (nn_data.toarray() if sparse.issparse(nn_data) else nn_data) # In the case that the median std was equal to zeros, we have to # create non-null entry based on the encoded of OHE if math.isclose(self.median_std_, 0): nn_data[:, self.continuous_features_.size:] = ( self._X_categorical_minority_encoded ) all_neighbors = nn_data[nn_num[rows]] categories_size = [self.continuous_features_.size] + [ cat.size for cat in self.ohe_.categories_ ] for start_idx, end_idx in zip(np.cumsum(categories_size)[:-1], np.cumsum(categories_size)[1:]): col_maxs = all_neighbors[:, :, start_idx:end_idx].sum(axis=1) # tie breaking argmax is_max = np.isclose(col_maxs, col_maxs.max(axis=1, keepdims=True)) max_idxs = rng.permutation(np.argwhere(is_max)) xs, idx_sels = np.unique(max_idxs[:, 0], return_index=True) col_sels = max_idxs[idx_sels, 1] ys = start_idx + col_sels X_new[:, start_idx:end_idx] = 0 X_new[xs, ys] = 1 return X_new @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class KMeansSMOTE(BaseSMOTE): """Apply a KMeans clustering before to over-sample using SMOTE. This is an implementation of the algorithm described in [1]_. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} k_neighbors : int or object, default=2 If ``int``, number of nearest neighbours to used to construct synthetic samples. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the k_neighbors. {n_jobs} kmeans_estimator : int or object, default=None A KMeans instance or the number of clusters to be used. By default, we used a :class:`sklearn.cluster.MiniBatchKMeans` which tend to be better with large number of samples. cluster_balance_threshold : "auto" or float, default="auto" The threshold at which a cluster is called balanced and where samples of the class selected for SMOTE will be oversampled. If "auto", this will be determined by the ratio for each class, or it can be set manually. density_exponent : "auto" or float, default="auto" This exponent is used to determine the density of a cluster. Leaving this to "auto" will use a feature-length based exponent. Attributes ---------- kmeans_estimator_ : estimator The fitted clustering method used before to apply SMOTE. nn_k_ : estimator The fitted k-NN estimator used in SMOTE. cluster_balance_threshold_ : float The threshold used during ``fit`` for calling a cluster balanced. See Also -------- SMOTE : Over-sample using SMOTE. SVMSMOTE : Over-sample using SVM-SMOTE variant. BorderlineSMOTE : Over-sample using Borderline-SMOTE variant. ADASYN : Over-sample using ADASYN. References ---------- .. 
[1] Felix Last, Georgios Douzas, Fernando Bacao, "Oversampling for Imbalanced Learning Based on K-Means and SMOTE" https://arxiv.org/abs/1711.00837 Examples -------- >>> import numpy as np >>> from imblearn.over_sampling import KMeansSMOTE >>> from sklearn.datasets import make_blobs >>> blobs = [100, 800, 100] >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)]) >>> # Add a single 0 sample in the middle blob >>> X = np.concatenate([X, [[0, 0]]]) >>> y = np.append(y, 0) >>> # Make this a binary classification problem >>> y = y == 1 >>> sm = KMeansSMOTE(random_state=42) >>> X_res, y_res = sm.fit_resample(X, y) >>> # Find the number of new samples in the middle blob >>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum() >>> print("Samples in the middle blob: %s" % n_res_in_middle) Samples in the middle blob: 801 >>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1)) Middle blob unchanged: True >>> print("More 0 samples: %s" % ((y_res == 0).sum() > (y == 0).sum())) More 0 samples: True """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", random_state=None, k_neighbors=2, n_jobs=None, kmeans_estimator=None, cluster_balance_threshold="auto", density_exponent="auto", ): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, n_jobs=n_jobs, ) self.kmeans_estimator = kmeans_estimator self.cluster_balance_threshold = cluster_balance_threshold self.density_exponent = density_exponent def _validate_estimator(self): super()._validate_estimator() if self.kmeans_estimator is None: self.kmeans_estimator_ = MiniBatchKMeans( random_state=self.random_state ) elif isinstance(self.kmeans_estimator, int): self.kmeans_estimator_ = MiniBatchKMeans( n_clusters=self.kmeans_estimator, random_state=self.random_state, ) else: self.kmeans_estimator_ = clone(self.kmeans_estimator) # validate the parameters for param_name in ("cluster_balance_threshold", "density_exponent"): param = getattr(self, param_name) if isinstance(param, str) and param != "auto": raise ValueError( "'{}' should be 'auto' when a string is passed. 
" "Got {} instead.".format(param_name, repr(param)) ) self.cluster_balance_threshold_ = ( self.cluster_balance_threshold if self.kmeans_estimator_.n_clusters != 1 else -np.inf ) def _find_cluster_sparsity(self, X): """Compute the cluster sparsity.""" euclidean_distances = pairwise_distances( X, metric="euclidean", n_jobs=self.n_jobs ) # negate diagonal elements for ind in range(X.shape[0]): euclidean_distances[ind, ind] = 0 non_diag_elements = (X.shape[0] ** 2) - X.shape[0] mean_distance = euclidean_distances.sum() / non_diag_elements exponent = ( math.log(X.shape[0], 1.6) ** 1.8 * 0.16 if self.density_exponent == "auto" else self.density_exponent ) return (mean_distance ** exponent) / X.shape[0] def _fit_resample(self, X, y): self._validate_estimator() X_resampled = X.copy() y_resampled = y.copy() total_inp_samples = sum(self.sampling_strategy_.values()) for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue # target_class_indices = np.flatnonzero(y == class_sample) # X_class = _safe_indexing(X, target_class_indices) X_clusters = self.kmeans_estimator_.fit_predict(X) valid_clusters = [] cluster_sparsities = [] # identify cluster which are answering the requirements for cluster_idx in range(self.kmeans_estimator_.n_clusters): cluster_mask = np.flatnonzero(X_clusters == cluster_idx) X_cluster = _safe_indexing(X, cluster_mask) y_cluster = _safe_indexing(y, cluster_mask) cluster_class_mean = (y_cluster == class_sample).mean() if self.cluster_balance_threshold_ == "auto": balance_threshold = n_samples / total_inp_samples / 2 else: balance_threshold = self.cluster_balance_threshold_ # the cluster is already considered balanced if cluster_class_mean < balance_threshold: continue # not enough samples to apply SMOTE anticipated_samples = cluster_class_mean * X_cluster.shape[0] if anticipated_samples < self.nn_k_.n_neighbors: continue X_cluster_class = _safe_indexing( X_cluster, np.flatnonzero(y_cluster == class_sample) ) valid_clusters.append(cluster_mask) cluster_sparsities.append( self._find_cluster_sparsity(X_cluster_class) ) cluster_sparsities = np.array(cluster_sparsities) cluster_weights = cluster_sparsities / cluster_sparsities.sum() if not valid_clusters: raise RuntimeError( "No clusters found with sufficient samples of " "class {}. Try lowering the cluster_balance_threshold " "or increasing the number of " "clusters.".format(class_sample) ) for valid_cluster_idx, valid_cluster in enumerate(valid_clusters): X_cluster = _safe_indexing(X, valid_cluster) y_cluster = _safe_indexing(y, valid_cluster) X_cluster_class = _safe_indexing( X_cluster, np.flatnonzero(y_cluster == class_sample) ) self.nn_k_.fit(X_cluster_class) nns = self.nn_k_.kneighbors( X_cluster_class, return_distance=False )[:, 1:] cluster_n_samples = int( math.ceil(n_samples * cluster_weights[valid_cluster_idx]) ) X_new, y_new = self._make_samples( X_cluster_class, y.dtype, class_sample, X_cluster_class, nns, cluster_n_samples, 1.0, ) stack = [np.vstack, sparse.vstack][int(sparse.issparse(X_new))] X_resampled = stack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled imbalanced-learn-0.7.0/imblearn/over_sampling/base.py000066400000000000000000000040401366766276300226420ustar00rootroot00000000000000""" Base class for the over-sampling method. """ # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from ..base import BaseSampler class BaseOverSampler(BaseSampler): """Base class for over-sampling algorithms. 
Warning: This class should not be used directly. Use the derive classes instead. """ _sampling_type = "over-sampling" _sampling_strategy_docstring = """sampling_strategy : float, str, dict or callable, default='auto' Sampling information to resample the data set. - When ``float``, it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling. Therefore, the ratio is expressed as :math:`\\alpha_{os} = N_{rm} / N_{M}` where :math:`N_{rm}` is the number of samples in the minority class after resampling and :math:`N_{M}` is the number of samples in the majority class. .. warning:: ``float`` is only available for **binary** classification. An error is raised for multi-class classification. - When ``str``, specify the class targeted by the resampling. The number of samples in the different classes will be equalized. Possible choices are: ``'minority'``: resample only the minority class; ``'not minority'``: resample all classes but the minority class; ``'not majority'``: resample all classes but the majority class; ``'all'``: resample all classes; ``'auto'``: equivalent to ``'not majority'``. - When ``dict``, the keys correspond to the targeted classes. The values correspond to the desired number of samples for each targeted class. - When callable, function taking ``y`` and returns a ``dict``. The keys correspond to the targeted classes. The values correspond to the desired number of samples for each class. """.strip() imbalanced-learn-0.7.0/imblearn/over_sampling/tests/000077500000000000000000000000001366766276300225225ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/over_sampling/tests/__init__.py000066400000000000000000000000001366766276300246210ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/over_sampling/tests/test_adasyn.py000066400000000000000000000116441366766276300254200ustar00rootroot00000000000000"""Test the module under sampler.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import pytest import numpy as np from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.neighbors import NearestNeighbors from imblearn.over_sampling import ADASYN RND_SEED = 0 X = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], ] ) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 def test_ada_init(): sampling_strategy = "auto" ada = ADASYN(sampling_strategy=sampling_strategy, random_state=RND_SEED) assert ada.random_state == RND_SEED def test_ada_fit_resample(): ada = ADASYN(random_state=RND_SEED) X_resampled, y_resampled = ada.fit_resample(X, Y) X_gt = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, 
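            # (rows 1-20 of this expected array are the original X; the four
            # rows appended at the end are the ADASYN synthetics for class 0)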
-0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.88161986, -0.2829741], [0.35681689, -0.18814597], [1.4148276, 0.05308106], [0.3136591, -0.31327875], ] ) y_gt = np.array( [ 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, ] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_ada_fit_resample_nn_obj(): nn = NearestNeighbors(n_neighbors=6) ada = ADASYN(random_state=RND_SEED, n_neighbors=nn) X_resampled, y_resampled = ada.fit_resample(X, Y) X_gt = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.88161986, -0.2829741], [0.35681689, -0.18814597], [1.4148276, 0.05308106], [0.3136591, -0.31327875], ] ) y_gt = np.array( [ 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, ] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @pytest.mark.parametrize( "adasyn_params, err_msg", [ ( {"sampling_strategy": {0: 9, 1: 12}}, "No samples will be generated.", ), ({"n_neighbors": "rnd"}, "has to be one of"), ], ) def test_adasyn_error(adasyn_params, err_msg): adasyn = ADASYN(**adasyn_params) with pytest.raises(ValueError, match=err_msg): adasyn.fit_resample(X, Y) imbalanced-learn-0.7.0/imblearn/over_sampling/tests/test_borderline_smote.py000066400000000000000000000036001366766276300274660ustar00rootroot00000000000000import pytest import numpy as np from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from imblearn.over_sampling import BorderlineSMOTE @pytest.fixture def data(): X = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], ] ) y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) return X, y def test_borderline_smote_wrong_kind(data): bsmote = BorderlineSMOTE(kind="rand") with pytest.raises(ValueError, match='The possible "kind" of algorithm'): bsmote.fit_resample(*data) @pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"]) def test_borderline_smote(kind, data): bsmote = BorderlineSMOTE(kind=kind, random_state=42) bsmote_nn = BorderlineSMOTE( kind=kind, random_state=42, k_neighbors=NearestNeighbors(n_neighbors=6), m_neighbors=NearestNeighbors(n_neighbors=11), ) X_res_1, y_res_1 = bsmote.fit_resample(*data) X_res_2, y_res_2 = bsmote_nn.fit_resample(*data) assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, 
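        # the explicit NearestNeighbors objects (n_neighbors=6 and 11, i.e.
        # k + 1 and m + 1) must reproduce the default k_neighbors=5 /
        # m_neighbors=10 results exactly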
y_res_2) imbalanced-learn-0.7.0/imblearn/over_sampling/tests/test_kmeans_smote.py000066400000000000000000000073771366766276300266360ustar00rootroot00000000000000import pytest import numpy as np from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.cluster import KMeans from sklearn.cluster import MiniBatchKMeans from sklearn.neighbors import NearestNeighbors from imblearn.over_sampling import KMeansSMOTE from imblearn.over_sampling import SMOTE @pytest.fixture def data(): X = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], ] ) y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) return X, y def test_kmeans_smote(data): X, y = data kmeans_smote = KMeansSMOTE( kmeans_estimator=1, random_state=42, cluster_balance_threshold=0.0, k_neighbors=5, ) smote = SMOTE(random_state=42) X_res_1, y_res_1 = kmeans_smote.fit_resample(X, y) X_res_2, y_res_2 = smote.fit_resample(X, y) assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) assert kmeans_smote.nn_k_.n_neighbors == 6 assert kmeans_smote.kmeans_estimator_.n_clusters == 1 assert "batch_size" in kmeans_smote.kmeans_estimator_.get_params() @pytest.mark.parametrize("k_neighbors", [2, NearestNeighbors(n_neighbors=3)]) @pytest.mark.parametrize( "kmeans_estimator", [ 3, KMeans(n_clusters=3, random_state=42), MiniBatchKMeans(n_clusters=3, random_state=42), ], ) def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator): X, y = data kmeans_smote = KMeansSMOTE( random_state=42, kmeans_estimator=kmeans_estimator, k_neighbors=k_neighbors, ) X_resampled, y_resampled = kmeans_smote.fit_resample(X, y) assert X_resampled.shape == (24, 2) assert y_resampled.shape == (24,) assert kmeans_smote.nn_k_.n_neighbors == 3 assert kmeans_smote.kmeans_estimator_.n_clusters == 3 def test_sample_kmeans_not_enough_clusters(): rng = np.random.RandomState(42) X = rng.randn(30, 2) y = np.array([1] * 20 + [0] * 10) smote = KMeansSMOTE(random_state=42, kmeans_estimator=30, k_neighbors=2) with pytest.raises(RuntimeError): smote.fit_resample(X, y) @pytest.mark.parametrize("density_exponent", ["auto", 2]) @pytest.mark.parametrize("cluster_balance_threshold", ["auto", 0.8]) def test_sample_kmeans_density_estimation( data, density_exponent, cluster_balance_threshold ): X, y = data smote = KMeansSMOTE( random_state=42, density_exponent=density_exponent, cluster_balance_threshold=cluster_balance_threshold, ) smote.fit_resample(X, y) @pytest.mark.parametrize( "density_exponent, cluster_balance_threshold", [("xxx", "auto"), ("auto", "xxx")], ) def test_kmeans_smote_param_error( data, density_exponent, cluster_balance_threshold ): X, y = data kmeans_smote = KMeansSMOTE( density_exponent=density_exponent, cluster_balance_threshold=cluster_balance_threshold, ) with pytest.raises(ValueError, match="should be 'auto' when a string"): kmeans_smote.fit_resample(X, y) 
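# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the test suite): KMeansSMOTE
# with a user-supplied clusterer, mirroring the behaviour exercised by the
# tests above. The toy dataset and every parameter value are arbitrary.
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from imblearn.over_sampling import KMeansSMOTE

rng = np.random.RandomState(0)
# two well-separated blobs: 10 minority (class 0) and 40 majority (class 1)
X_demo = np.vstack([rng.normal(0, 1, (10, 2)), rng.normal(10, 1, (40, 2))])
y_demo = np.array([0] * 10 + [1] * 40)
sampler = KMeansSMOTE(
    kmeans_estimator=MiniBatchKMeans(n_clusters=2, random_state=0),
    k_neighbors=3,
    cluster_balance_threshold=0.1,
    random_state=0,
)
X_res, y_res = sampler.fit_resample(X_demo, y_demo)
print(np.bincount(y_res))  # expected: [40 40]
# ---------------------------------------------------------------------------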
imbalanced-learn-0.7.0/imblearn/over_sampling/tests/test_random_over_sampler.py000066400000000000000000000104271366766276300301750ustar00rootroot00000000000000"""Test the module under sampler.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter import numpy as np import pytest from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from imblearn.over_sampling import RandomOverSampler RND_SEED = 0 X = np.array( [ [0.04352327, -0.20515826], [0.92923648, 0.76103773], [0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982], ] ) Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) def test_ros_init(): sampling_strategy = "auto" ros = RandomOverSampler( sampling_strategy=sampling_strategy, random_state=RND_SEED ) assert ros.random_state == RND_SEED @pytest.mark.parametrize("as_frame", [True, False], ids=['dataframe', 'array']) def test_ros_fit_resample(as_frame): if as_frame: pd = pytest.importorskip("pandas") X_ = pd.DataFrame(X) else: X_ = X ros = RandomOverSampler(random_state=RND_SEED) X_resampled, y_resampled = ros.fit_resample(X_, Y) X_gt = np.array( [ [0.04352327, -0.20515826], [0.92923648, 0.76103773], [0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982], [0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.92923648, 0.76103773], [0.47104475, 0.44386323], ] ) y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) if as_frame: assert hasattr(X_resampled, "loc") X_resampled = X_resampled.to_numpy() assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_ros_fit_resample_half(): sampling_strategy = {0: 3, 1: 7} ros = RandomOverSampler( sampling_strategy=sampling_strategy, random_state=RND_SEED ) X_resampled, y_resampled = ros.fit_resample(X, Y) X_gt = np.array( [ [0.04352327, -0.20515826], [0.92923648, 0.76103773], [0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982], ] ) y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_multiclass_fit_resample(): y = Y.copy() y[5] = 2 y[6] = 2 ros = RandomOverSampler(random_state=RND_SEED) X_resampled, y_resampled = ros.fit_resample(X, y) count_y_res = Counter(y_resampled) assert count_y_res[0] == 5 assert count_y_res[1] == 5 assert count_y_res[2] == 5 def test_random_over_sampling_heterogeneous_data(): X_hetero = np.array( [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=np.object ) y = np.array([0, 0, 1]) ros = RandomOverSampler(random_state=RND_SEED) X_res, y_res = ros.fit_resample(X_hetero, y) assert X_res.shape[0] == 4 assert y_res.shape[0] == 4 assert X_res.dtype == object assert X_res[-1, 0] in X_hetero[:, 0] def test_random_over_sampling_nan_inf(): # check that we can oversample even with missing or infinite data # regression tests for #605 rng = np.random.RandomState(42) n_not_finite = X.shape[0] // 3 row_indices = rng.choice(np.arange(X.shape[0]), size=n_not_finite) col_indices = rng.randint(0, X.shape[1], size=n_not_finite) not_finite_values = rng.choice([np.nan, np.inf], size=n_not_finite) X_ = 
X.copy() X_[row_indices, col_indices] = not_finite_values ros = RandomOverSampler(random_state=0) X_res, y_res = ros.fit_resample(X_, Y) assert y_res.shape == (14,) assert X_res.shape == (14, 2) assert np.any(~np.isfinite(X_res)) imbalanced-learn-0.7.0/imblearn/over_sampling/tests/test_smote.py000066400000000000000000000140631366766276300252660ustar00rootroot00000000000000"""Test the module SMOTE.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np import pytest from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.neighbors import NearestNeighbors from imblearn.over_sampling import SMOTE from imblearn.over_sampling import SVMSMOTE from imblearn.over_sampling import BorderlineSMOTE RND_SEED = 0 X = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], ] ) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 def test_sample_regular(): smote = SMOTE(random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.29307743, -0.14670439], [0.84976473, -0.15570176], [0.61319159, -0.11571668], [0.66052536, -0.28246517], ] ) y_gt = np.array( [ 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, ] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_half(): sampling_strategy = {0: 9, 1: 12} smote = SMOTE(sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.36784496, -0.1953161], ] ) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_with_nn(): nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE(random_state=RND_SEED, k_neighbors=nn_k) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ 
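            # identical to the ground truth of test_sample_regular above:
            # passing NearestNeighbors(n_neighbors=6) matches the default
            # k_neighbors=5, since SMOTE internally uses k + 1 neighbours
            # (the sample itself is found as its own neighbour and discarded)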
[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.29307743, -0.14670439], [0.84976473, -0.15570176], [0.61319159, -0.11571668], [0.66052536, -0.28246517], ] ) y_gt = np.array( [ 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, ] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @pytest.mark.parametrize( "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"] ) def test_smote_m_neighbors(smote): # check that m_neighbors is properly set. Regression test for: # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568 _ = smote.fit_resample(X, Y) assert smote.nn_k_.n_neighbors == 6 assert smote.nn_m_.n_neighbors == 11 imbalanced-learn-0.7.0/imblearn/over_sampling/tests/test_smote_nc.py000066400000000000000000000206271366766276300257510ustar00rootroot00000000000000"""Test the module SMOTENC.""" # Authors: Guillaume Lemaitre # Christos Aridas # Dzianis Dudnik # License: MIT from collections import Counter import pytest import numpy as np from scipy import sparse from sklearn.datasets import make_classification from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from imblearn.over_sampling import SMOTENC def data_heterogneous_ordered(): rng = np.random.RandomState(42) X = np.empty((30, 4), dtype=object) # create 2 random continuous feature X[:, :2] = rng.randn(30, 2) # create a categorical feature using some string X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) # return the categories return X, y, [2, 3] def data_heterogneous_unordered(): rng = np.random.RandomState(42) X = np.empty((30, 4), dtype=object) # create 2 random continuous feature X[:, [1, 2]] = rng.randn(30, 2) # create a categorical feature using some string X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) # return the categories return X, y, [0, 3] def data_heterogneous_masked(): rng = np.random.RandomState(42) X = np.empty((30, 4), dtype=object) # create 2 random continuous feature X[:, [1, 2]] = rng.randn(30, 2) # create a categorical feature using some string X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) # return the categories return X, y, [True, False, True] def data_heterogneous_unordered_multiclass(): rng = np.random.RandomState(42) X = np.empty((50, 4), dtype=object) # create 2 random continuous feature X[:, [1, 2]] = rng.randn(50, 2) # create a categorical feature using some string X[:, 0] = rng.choice(["a", "b", "c"], size=50).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=50) y = np.array([0] * 10 + [1] * 15 + [2] * 25) # return the categories return X, y, [0, 3] def 
data_sparse(format): rng = np.random.RandomState(42) X = np.empty((30, 4), dtype=np.float64) # create 2 random continuous feature X[:, [1, 2]] = rng.randn(30, 2) # create a categorical feature using some string X[:, 0] = rng.randint(3, size=30) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) X = sparse.csr_matrix(X) if format == "csr" else sparse.csc_matrix(X) return X, y, [0, 3] def test_smotenc_error(): X, y, _ = data_heterogneous_unordered() categorical_features = [0, 10] smote = SMOTENC(random_state=0, categorical_features=categorical_features) with pytest.raises(ValueError, match="indices are out of range"): smote.fit_resample(X, y) @pytest.mark.parametrize( "data", [ data_heterogneous_ordered(), data_heterogneous_unordered(), data_heterogneous_masked(), data_sparse("csr"), data_sparse("csc"), ], ) def test_smotenc(data): X, y, categorical_features = data smote = SMOTENC(random_state=0, categorical_features=categorical_features) X_resampled, y_resampled = smote.fit_resample(X, y) assert X_resampled.dtype == X.dtype categorical_features = np.array(categorical_features) if categorical_features.dtype == bool: categorical_features = np.flatnonzero(categorical_features) for cat_idx in categorical_features: if sparse.issparse(X): assert set(X[:, cat_idx].data) == set(X_resampled[:, cat_idx].data) assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype else: assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx]) assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype # part of the common test which apply to SMOTE-NC even if it is not default # constructible def test_smotenc_check_target_type(): X, _, categorical_features = data_heterogneous_unordered() y = np.linspace(0, 1, 30) smote = SMOTENC(categorical_features=categorical_features, random_state=0) with pytest.raises(ValueError, match="Unknown label type: 'continuous'"): smote.fit_resample(X, y) rng = np.random.RandomState(42) y = rng.randint(2, size=(20, 3)) msg = "Multilabel and multioutput targets are not supported." 
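    # multilabel/multioutput targets are rejected by the target-type check,
    # just like the continuous target asserted above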
with pytest.raises(ValueError, match=msg): smote.fit_resample(X, y) def test_smotenc_samplers_one_label(): X, _, categorical_features = data_heterogneous_unordered() y = np.zeros(30) smote = SMOTENC(categorical_features=categorical_features, random_state=0) with pytest.raises(ValueError, match="needs to have more than 1 class"): smote.fit(X, y) def test_smotenc_fit(): X, y, categorical_features = data_heterogneous_unordered() smote = SMOTENC(categorical_features=categorical_features, random_state=0) smote.fit_resample(X, y) assert hasattr( smote, "sampling_strategy_" ), "No fitted attribute sampling_strategy_" def test_smotenc_fit_resample(): X, y, categorical_features = data_heterogneous_unordered() target_stats = Counter(y) smote = SMOTENC(categorical_features=categorical_features, random_state=0) _, y_res = smote.fit_resample(X, y) _ = Counter(y_res) n_samples = max(target_stats.values()) assert all(value >= n_samples for value in Counter(y_res).values()) def test_smotenc_fit_resample_sampling_strategy(): X, y, categorical_features = data_heterogneous_unordered_multiclass() expected_stat = Counter(y)[1] smote = SMOTENC(categorical_features=categorical_features, random_state=0) sampling_strategy = {2: 25, 0: 25} smote.set_params(sampling_strategy=sampling_strategy) X_res, y_res = smote.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat def test_smotenc_pandas(): pd = pytest.importorskip("pandas") # Check that the samplers handle pandas dataframe and pandas series X, y, categorical_features = data_heterogneous_unordered_multiclass() X_pd = pd.DataFrame(X) smote = SMOTENC(categorical_features=categorical_features, random_state=0) X_res_pd, y_res_pd = smote.fit_resample(X_pd, y) X_res, y_res = smote.fit_resample(X, y) assert_array_equal(X_res_pd.to_numpy(), X_res) assert_allclose(y_res_pd, y_res) def test_smotenc_preserve_dtype(): X, y = make_classification( n_samples=50, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0, ) # Cast X and y to not default dtype X = X.astype(np.float32) y = y.astype(np.int32) smote = SMOTENC(categorical_features=[1], random_state=0) X_res, y_res = smote.fit_resample(X, y) assert X.dtype == X_res.dtype, "X dtype is not preserved" assert y.dtype == y_res.dtype, "y dtype is not preserved" @pytest.mark.parametrize( "categorical_features", [[True, True, True], [0, 1, 2]] ) def test_smotenc_raising_error_all_categorical(categorical_features): X, y = make_classification( n_features=3, n_informative=1, n_redundant=1, n_repeated=0, n_clusters_per_class=1, ) smote = SMOTENC(categorical_features=categorical_features) err_msg = "SMOTE-NC is not designed to work only with categorical features" with pytest.raises(ValueError, match=err_msg): smote.fit_resample(X, y) def test_smote_nc_with_null_median_std(): # Non-regression test for #662 # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/662 data = np.array([[1, 2, 1, 'A'], [2, 1, 2, 'A'], [1, 2, 3, 'B'], [1, 2, 4, 'C'], [1, 2, 5, 'C']], dtype="object") labels = np.array( ['class_1', 'class_1', 'class_1', 'class_2', 'class_2'], dtype=object ) smote = SMOTENC(categorical_features=[3], k_neighbors=1, random_state=0) X_res, y_res = smote.fit_resample(data, labels) # check that the categorical feature is not random but correspond to the # categories seen in the minority class samples assert X_res[-1, -1] == "C" imbalanced-learn-0.7.0/imblearn/over_sampling/tests/test_svm_smote.py000066400000000000000000000032301366766276300261450ustar00rootroot00000000000000import pytest import numpy 
as np from sklearn.neighbors import NearestNeighbors from sklearn.svm import SVC from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from imblearn.over_sampling import SVMSMOTE @pytest.fixture def data(): X = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], ] ) y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) return X, y def test_svm_smote(data): svm_smote = SVMSMOTE(random_state=42) svm_smote_nn = SVMSMOTE( random_state=42, k_neighbors=NearestNeighbors(n_neighbors=6), m_neighbors=NearestNeighbors(n_neighbors=11), svm_estimator=SVC(gamma="scale", random_state=42), ) X_res_1, y_res_1 = svm_smote.fit_resample(*data) X_res_2, y_res_2 = svm_smote_nn.fit_resample(*data) assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) imbalanced-learn-0.7.0/imblearn/pipeline.py000066400000000000000000000432151366766276300206770ustar00rootroot00000000000000""" The :mod:`imblearn.pipeline` module implements utilities to build a composite estimator, as a chain of transforms, samples and estimators. """ # Adapted from scikit-learn # Author: Edouard Duchesnay # Gael Varoquaux # Virgile Fritsch # Alexandre Gramfort # Lars Buitinck # Christos Aridas # Guillaume Lemaitre # License: BSD from sklearn import pipeline from sklearn.base import clone from sklearn.utils import _print_elapsed_time from sklearn.utils.metaestimators import if_delegate_has_method from sklearn.utils.validation import check_memory __all__ = ["Pipeline", "make_pipeline"] class Pipeline(pipeline.Pipeline): """Pipeline of transforms and resamples with a final estimator. Sequentially apply a list of transforms, sampling, and a final estimator. Intermediate steps of the pipeline must be transformers or resamplers, that is, they must implement fit, transform and sample methods. The samplers are only applied during fit. The final estimator only needs to implement fit. The transformers and samplers in the pipeline can be cached using ``memory`` argument. The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. For this, it enables setting parameters of the various steps using their names and the parameter name separated by a '__', as in the example below. A step's estimator may be replaced entirely by setting the parameter with its name to another estimator, or a transformer removed by setting it to 'passthrough' or ``None``. Parameters ---------- steps : list List of (name, transform) tuples (implementing fit/transform/fit_resample) that are chained, in the order in which they are chained, with the last object an estimator. memory : Instance of joblib.Memory or str, default=None Used to cache the fitted transformers of the pipeline. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. 
Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. verbose : bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed. Attributes ---------- named_steps : bunch object, a dictionary with attribute access Read-only attribute to access any step parameter by user given name. Keys are step names and values are steps parameters. See Also -------- make_pipeline : Helper function to make pipeline. Notes ----- See :ref:`sphx_glr_auto_examples_pipeline_plot_pipeline_classification.py` Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from sklearn.model_selection import train_test_split as tts >>> from sklearn.decomposition import PCA >>> from sklearn.neighbors import KNeighborsClassifier as KNN >>> from sklearn.metrics import classification_report >>> from imblearn.over_sampling import SMOTE >>> from imblearn.pipeline import Pipeline # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape {}'.format(Counter(y))) Original dataset shape Counter({1: 900, 0: 100}) >>> pca = PCA() >>> smt = SMOTE(random_state=42) >>> knn = KNN() >>> pipeline = Pipeline([('smt', smt), ('pca', pca), ('knn', knn)]) >>> X_train, X_test, y_train, y_test = tts(X, y, random_state=42) >>> pipeline.fit(X_train, y_train) # doctest: +ELLIPSIS Pipeline(...) >>> y_hat = pipeline.predict(X_test) >>> print(classification_report(y_test, y_hat)) precision recall f1-score support 0 0.87 1.00 0.93 26 1 1.00 0.98 0.99 224 accuracy 0.98 250 macro avg 0.93 0.99 0.96 250 weighted avg 0.99 0.98 0.98 250 """ # BaseEstimator interface def _validate_steps(self): names, estimators = zip(*self.steps) # validate names self._validate_names(names) # validate estimators transformers = estimators[:-1] estimator = estimators[-1] for t in transformers: if t is None or t == "passthrough": continue if not ( hasattr(t, "fit") or hasattr(t, "fit_transform") or hasattr(t, "fit_resample") ) or not (hasattr(t, "transform") or hasattr(t, "fit_resample")): raise TypeError( "All intermediate steps of the chain should " "be estimators that implement fit and transform or " "fit_resample (but not both) or be a string 'passthrough' " "'%s' (type %s) doesn't)" % (t, type(t)) ) if hasattr(t, "fit_resample") and ( hasattr(t, "fit_transform") or hasattr(t, "transform") ): raise TypeError( "All intermediate steps of the chain should " "be estimators that implement fit and transform or " "fit_resample." " '%s' implements both)" % (t) ) if isinstance(t, pipeline.Pipeline): raise TypeError( "All intermediate steps of the chain should not be" " Pipelines" ) # We allow last estimator to be None as an identity transformation if ( estimator is not None and estimator != "passthrough" and not hasattr(estimator, "fit") ): raise TypeError( "Last step of Pipeline should implement fit or be " "the string 'passthrough'. '%s' (type %s) doesn't" % (estimator, type(estimator)) ) def _iter( self, with_final=True, filter_passthrough=True, filter_resample=True ): """Generate (idx, (name, trans)) tuples from self.steps. When `filter_passthrough` is `True`, 'passthrough' and None transformers are filtered out. 
When `filter_resample` is `True`, estimator with a method `fit_resample` are filtered out. """ it = super()._iter(with_final, filter_passthrough) if filter_resample: return filter(lambda x: not hasattr(x[-1], "fit_resample"), it) else: return it # Estimator interface def _fit(self, X, y=None, **fit_params): self.steps = list(self.steps) self._validate_steps() # Setup the memory memory = check_memory(self.memory) fit_transform_one_cached = memory.cache(pipeline._fit_transform_one) fit_resample_one_cached = memory.cache(_fit_resample_one) fit_params_steps = { name: {} for name, step in self.steps if step is not None } for pname, pval in fit_params.items(): if '__' not in pname: raise ValueError( "Pipeline.fit does not accept the {} parameter. " "You can pass parameters to specific steps of your " "pipeline using the stepname__parameter format, e.g. " "`Pipeline.fit(X, y, logisticregression__sample_weight" "=sample_weight)`.".format(pname)) step, param = pname.split("__", 1) fit_params_steps[step][param] = pval for (step_idx, name, transformer) in self._iter(with_final=False, filter_passthrough=False, filter_resample=False): if (transformer is None or transformer == 'passthrough'): with _print_elapsed_time('Pipeline', self._log_message(step_idx)): continue try: # joblib >= 0.12 mem = memory.location except AttributeError: mem = memory.cachedir finally: cloned_transformer = clone(transformer) if mem else transformer # Fit or load from cache the current transfomer if hasattr(cloned_transformer, "transform") or hasattr( cloned_transformer, "fit_transform" ): X, fitted_transformer = fit_transform_one_cached( cloned_transformer, X, y, None, message_clsname='Pipeline', message=self._log_message(step_idx), **fit_params_steps[name] ) elif hasattr(cloned_transformer, "fit_resample"): X, y, fitted_transformer = fit_resample_one_cached( cloned_transformer, X, y, message_clsname='Pipeline', message=self._log_message(step_idx), **fit_params_steps[name] ) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer # from the cache. self.steps[step_idx] = (name, fitted_transformer) if self._final_estimator == "passthrough": return X, y, {} return X, y, fit_params_steps[self.steps[-1][0]] def fit(self, X, y=None, **fit_params): """Fit the model. Fit all the transforms/samplers one after the other and transform/sample the data, then fit the transformed/sampled data using the final estimator. Parameters ---------- X : iterable Training data. Must fulfill input requirements of first step of the pipeline. y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. **fit_params : dict of str -> object Parameters passed to the ``fit`` method of each step, where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``. Returns ------- self : Pipeline This estimator. """ Xt, yt, fit_params = self._fit(X, y, **fit_params) with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): if self._final_estimator != "passthrough": self._final_estimator.fit(Xt, yt, **fit_params) return self def fit_transform(self, X, y=None, **fit_params): """Fit the model and transform with the final estimator. Fits all the transformers/samplers one after the other and transform/sample the data, then uses fit_transform on transformed data with the final estimator. Parameters ---------- X : iterable Training data. Must fulfill input requirements of first step of the pipeline. 
y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. **fit_params : dict of string -> object Parameters passed to the ``fit`` method of each step, where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``. Returns ------- Xt : array-like of shape (n_samples, n_transformed_features) Transformed samples. """ last_step = self._final_estimator Xt, yt, fit_params = self._fit(X, y, **fit_params) with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): if last_step == "passthrough": return Xt elif hasattr(last_step, "fit_transform"): return last_step.fit_transform(Xt, yt, **fit_params) else: return last_step.fit(Xt, yt, **fit_params).transform(Xt) def fit_resample(self, X, y=None, **fit_params): """Fit the model and sample with the final estimator. Fits all the transformers/samplers one after the other and transform/sample the data, then uses fit_resample on transformed data with the final estimator. Parameters ---------- X : iterable Training data. Must fulfill input requirements of first step of the pipeline. y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. **fit_params : dict of string -> object Parameters passed to the ``fit`` method of each step, where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``. Returns ------- Xt : array-like of shape (n_samples, n_transformed_features) Transformed samples. yt : array-like of shape (n_samples, n_transformed_features) Transformed target. """ last_step = self._final_estimator Xt, yt, fit_params = self._fit(X, y, **fit_params) with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): if last_step == "passthrough": return Xt elif hasattr(last_step, "fit_resample"): return last_step.fit_resample(Xt, yt, **fit_params) @if_delegate_has_method(delegate="_final_estimator") def fit_predict(self, X, y=None, **fit_params): """Apply `fit_predict` of last step in pipeline after transforms. Applies fit_transforms of a pipeline to the data, followed by the fit_predict method of the final estimator in the pipeline. Valid only if the final estimator implements fit_predict. Parameters ---------- X : iterable Training data. Must fulfill input requirements of first step of the pipeline. y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. **fit_params : dict of string -> object Parameters passed to the ``fit`` method of each step, where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``. Returns ------- y_pred : ndarray of shape (n_samples,) The predicted target. """ Xt, yt, fit_params = self._fit(X, y, **fit_params) with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): y_pred = self.steps[-1][-1].fit_predict(Xt, yt, **fit_params) return y_pred def _fit_resample_one(sampler, X, y, message_clsname='', message=None, **fit_params): with _print_elapsed_time(message_clsname, message): X_res, y_res = sampler.fit_resample(X, y, **fit_params) return X_res, y_res, sampler def make_pipeline(*steps, **kwargs): """Construct a Pipeline from the given estimators. This is a shorthand for the Pipeline constructor; it does not require, and does not permit, naming the estimators. Instead, their names will be set to the lowercase of their types automatically. Parameters ---------- *steps : list of estimators A list of estimators. 
memory : None, str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. verbose : bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed. Returns ------- p : Pipeline See Also -------- imblearn.pipeline.Pipeline : Class for creating a pipeline of transforms with a final estimator. Examples -------- >>> from sklearn.naive_bayes import GaussianNB >>> from sklearn.preprocessing import StandardScaler >>> make_pipeline(StandardScaler(), GaussianNB(priors=None)) ... # doctest: +NORMALIZE_WHITESPACE Pipeline(steps=[('standardscaler', StandardScaler()), ('gaussiannb', GaussianNB())]) """ memory = kwargs.pop("memory", None) verbose = kwargs.pop('verbose', False) if kwargs: raise TypeError( 'Unknown keyword arguments: "{}"'.format(list(kwargs.keys())[0]) ) return Pipeline( pipeline._name_estimators(steps), memory=memory, verbose=verbose ) imbalanced-learn-0.7.0/imblearn/tensorflow/000077500000000000000000000000001366766276300207155ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/tensorflow/__init__.py000066400000000000000000000003011366766276300230200ustar00rootroot00000000000000"""The :mod:`imblearn.tensorflow` provides utilities to deal with imbalanced dataset in tensorflow.""" from ._generator import balanced_batch_generator __all__ = ["balanced_batch_generator"] imbalanced-learn-0.7.0/imblearn/tensorflow/_generator.py000066400000000000000000000065431366766276300234240ustar00rootroot00000000000000"""Implement generators for ``tensorflow`` which will balance the data.""" from scipy.sparse import issparse from sklearn.base import clone from sklearn.utils import _safe_indexing from sklearn.utils import check_random_state from ..under_sampling import RandomUnderSampler from ..utils import Substitution from ..utils._docstring import _random_state_docstring from ..utils._validation import _deprecate_positional_args @Substitution(random_state=_random_state_docstring) @_deprecate_positional_args def balanced_batch_generator( X, y, *, sample_weight=None, sampler=None, batch_size=32, keep_sparse=False, random_state=None, ): """Create a balanced batch generator to train tensorflow model. Returns a generator --- as well as the number of step per epoch --- which is given to ``fit_generator``. The sampler defines the sampling strategy used to balance the dataset ahead of creating the batch. The sampler should have an attribute ``sample_indices_``. Parameters ---------- X : ndarray, shape (n_samples, n_features) Original imbalanced dataset. y : ndarray, shape (n_samples,) or (n_samples, n_classes) Associated targets. sample_weight : ndarray, shape (n_samples,) Sample weight. sampler : object or None, optional (default=RandomUnderSampler) A sampler instance which has an attribute ``sample_indices_``. By default, the sampler used is a :class:`imblearn.under_sampling.RandomUnderSampler`. batch_size : int, optional (default=32) Number of samples per gradient update. keep_sparse : bool, optional (default=False) Either or not to conserve or not the sparsity of the input ``X``. 
By default, the returned batches will be dense. {random_state} Returns ------- generator : generator of tuple Generate batch of data. The tuple generated are either (X_batch, y_batch) or (X_batch, y_batch, sampler_weight_batch). steps_per_epoch : int The number of samples per epoch. """ random_state = check_random_state(random_state) if sampler is None: sampler_ = RandomUnderSampler(random_state=random_state) else: sampler_ = clone(sampler) sampler_.fit_resample(X, y) if not hasattr(sampler_, "sample_indices_"): raise ValueError( "'sampler' needs to have an attribute " "'sample_indices_'." ) indices = sampler_.sample_indices_ # shuffle the indices since the sampler are packing them by class random_state.shuffle(indices) def generator(X, y, sample_weight, indices, batch_size): while True: for index in range(0, len(indices), batch_size): X_res = _safe_indexing(X, indices[index:index + batch_size]) y_res = _safe_indexing(y, indices[index:index + batch_size]) if issparse(X_res) and not keep_sparse: X_res = X_res.toarray() if sample_weight is None: yield X_res, y_res else: sw_res = _safe_indexing( sample_weight, indices[index:index + batch_size] ) yield X_res, y_res, sw_res return ( generator(X, y, sample_weight, indices, batch_size), int(indices.size // batch_size), ) imbalanced-learn-0.7.0/imblearn/tensorflow/tests/000077500000000000000000000000001366766276300220575ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/tensorflow/tests/test_generator.py000066400000000000000000000126521366766276300254640ustar00rootroot00000000000000from distutils.version import LooseVersion import pytest import numpy as np from scipy import sparse from sklearn.datasets import load_iris from imblearn.datasets import make_imbalance from imblearn.under_sampling import NearMiss from imblearn.over_sampling import RandomOverSampler from imblearn.tensorflow import balanced_batch_generator tf = pytest.importorskip("tensorflow") @pytest.fixture def data(): X, y = load_iris(return_X_y=True) X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40}) X = X.astype(np.float32) return X, y def check_balanced_batch_generator_tf_1_X_X(dataset, sampler): X, y = dataset batch_size = 10 training_generator, steps_per_epoch = balanced_batch_generator( X, y, sample_weight=None, sampler=sampler, batch_size=batch_size, random_state=42, ) learning_rate = 0.01 epochs = 10 input_size = X.shape[1] output_size = 3 # helper functions def init_weights(shape): return tf.Variable(tf.random_normal(shape, stddev=0.01)) def accuracy(y_true, y_pred): return np.mean(np.argmax(y_pred, axis=1) == y_true) # input and output data = tf.placeholder("float32", shape=[None, input_size]) targets = tf.placeholder("int32", shape=[None]) # build the model and weights W = init_weights([input_size, output_size]) b = init_weights([output_size]) out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) # build the loss, predict, and train operator cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=out_act, labels=targets ) loss = tf.reduce_sum(cross_entropy) optimizer = tf.train.GradientDescentOptimizer(learning_rate) train_op = optimizer.minimize(loss) predict = tf.nn.softmax(out_act) # Initialization of all variables in the graph init = tf.global_variables_initializer() with tf.Session() as sess: sess.run(init) for e in range(epochs): for i in range(steps_per_epoch): X_batch, y_batch = next(training_generator) sess.run( [train_op, loss], feed_dict={data: X_batch, targets: y_batch}, ) # For each epoch, run accuracy on train and test predicts_train = 
sess.run(predict, feed_dict={data: X}) print( "epoch: {} train accuracy: {:.3f}".format( e, accuracy(y, predicts_train) ) ) def check_balanced_batch_generator_tf_2_X_X_compat_1_X_X(dataset, sampler): tf.compat.v1.disable_eager_execution() X, y = dataset batch_size = 10 training_generator, steps_per_epoch = balanced_batch_generator( X, y, sample_weight=None, sampler=sampler, batch_size=batch_size, random_state=42, ) learning_rate = 0.01 epochs = 10 input_size = X.shape[1] output_size = 3 # helper functions def init_weights(shape): return tf.Variable(tf.random.normal(shape, stddev=0.01)) def accuracy(y_true, y_pred): return np.mean(np.argmax(y_pred, axis=1) == y_true) # input and output data = tf.compat.v1.placeholder("float32", shape=[None, input_size]) targets = tf.compat.v1.placeholder("int32", shape=[None]) # build the model and weights W = init_weights([input_size, output_size]) b = init_weights([output_size]) out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) # build the loss, predict, and train operator cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=out_act, labels=targets ) loss = tf.reduce_sum(input_tensor=cross_entropy) optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate) train_op = optimizer.minimize(loss) predict = tf.nn.softmax(out_act) # Initialization of all variables in the graph init = tf.compat.v1.global_variables_initializer() with tf.compat.v1.Session() as sess: sess.run(init) for e in range(epochs): for i in range(steps_per_epoch): X_batch, y_batch = next(training_generator) sess.run( [train_op, loss], feed_dict={data: X_batch, targets: y_batch}, ) # For each epoch, run accuracy on train and test predicts_train = sess.run(predict, feed_dict={data: X}) print( "epoch: {} train accuracy: {:.3f}".format( e, accuracy(y, predicts_train) ) ) @pytest.mark.parametrize("sampler", [None, NearMiss(), RandomOverSampler()]) def test_balanced_batch_generator(data, sampler): if LooseVersion(tf.__version__) < '2': check_balanced_batch_generator_tf_1_X_X(data, sampler) else: check_balanced_batch_generator_tf_2_X_X_compat_1_X_X(data, sampler) @pytest.mark.parametrize("keep_sparse", [True, False]) def test_balanced_batch_generator_function_sparse(data, keep_sparse): X, y = data training_generator, steps_per_epoch = balanced_batch_generator( sparse.csr_matrix(X), y, keep_sparse=keep_sparse, batch_size=10, random_state=42, ) for idx in range(steps_per_epoch): X_batch, y_batch = next(training_generator) if keep_sparse: assert sparse.issparse(X_batch) else: assert not sparse.issparse(X_batch) imbalanced-learn-0.7.0/imblearn/tests/000077500000000000000000000000001366766276300176555ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/tests/__init__.py000066400000000000000000000000001366766276300217540ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/tests/test_base.py000066400000000000000000000057271366766276300222130ustar00rootroot00000000000000"""Test for miscellaneous samplers objects.""" # Authors: Guillaume Lemaitre # License: MIT import pytest import numpy as np from scipy import sparse from sklearn.datasets import load_iris from sklearn.datasets import make_regression from sklearn.linear_model import LinearRegression from sklearn.utils import _safe_indexing from sklearn.utils.multiclass import type_of_target from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose_dense_sparse from imblearn.datasets import make_imbalance from imblearn.pipeline import make_pipeline from 
imblearn.under_sampling import RandomUnderSampler from imblearn import FunctionSampler iris = load_iris() X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 10, 1: 25}, random_state=0 ) def test_function_sampler_reject_sparse(): X_sparse = sparse.csr_matrix(X) sampler = FunctionSampler(accept_sparse=False) with pytest.raises( TypeError, match="A sparse matrix was passed, " "but dense data is required", ): sampler.fit_resample(X_sparse, y) @pytest.mark.parametrize( "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] ) def test_function_sampler_identity(X, y): sampler = FunctionSampler() X_res, y_res = sampler.fit_resample(X, y) assert_allclose_dense_sparse(X_res, X) assert_array_equal(y_res, y) @pytest.mark.parametrize( "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] ) def test_function_sampler_func(X, y): def func(X, y): return X[:10], y[:10] sampler = FunctionSampler(func=func) X_res, y_res = sampler.fit_resample(X, y) assert_allclose_dense_sparse(X_res, X[:10]) assert_array_equal(y_res, y[:10]) @pytest.mark.parametrize( "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] ) def test_function_sampler_func_kwargs(X, y): def func(X, y, sampling_strategy, random_state): rus = RandomUnderSampler( sampling_strategy=sampling_strategy, random_state=random_state ) return rus.fit_resample(X, y) sampler = FunctionSampler( func=func, kw_args={"sampling_strategy": "auto", "random_state": 0} ) X_res, y_res = sampler.fit_resample(X, y) X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_resample(X, y) assert_allclose_dense_sparse(X_res, X_res_2) assert_array_equal(y_res, y_res_2) def test_function_sampler_validate(): # check that we can let a pass a regression variable by turning down the # validation X, y = make_regression() def dummy_sampler(X, y): indices = np.random.choice(np.arange(X.shape[0]), size=100) return _safe_indexing(X, indices), _safe_indexing(y, indices) sampler = FunctionSampler(func=dummy_sampler, validate=False) pipeline = make_pipeline(sampler, LinearRegression()) y_pred = pipeline.fit(X, y).predict(X) assert type_of_target(y_pred) == 'continuous' imbalanced-learn-0.7.0/imblearn/tests/test_common.py000066400000000000000000000043001366766276300225530ustar00rootroot00000000000000"""Common tests""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import pytest from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning from sklearn.utils.estimator_checks import parametrize_with_checks as \ parametrize_with_checks_sklearn from sklearn.utils.estimator_checks import _construct_instance from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import set_random_state from sklearn.utils._testing import SkipTest from imblearn.utils.estimator_checks import parametrize_with_checks from imblearn.utils.estimator_checks import _set_checking_parameters from imblearn.utils.estimator_checks import _yield_all_checks from imblearn.utils.testing import all_estimators from imblearn.under_sampling import NearMiss @pytest.mark.parametrize("name, Estimator", all_estimators()) def test_all_estimator_no_base_class(name, Estimator): # test that all_estimators doesn't find abstract classes. 
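    # ``all_estimators()`` yields ``(name, class)`` pairs for every public
    # estimator exposed by imblearn; abstract helpers follow the ``Base*``
    # naming convention, which is the property this check relies on. A
    # minimal sketch of that contract (pure illustration, adding nothing
    # beyond the imports above):
    #
    #     for est_name, est_cls in all_estimators():
    #         assert isinstance(est_name, str) and isinstance(est_cls, type)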
msg = ( f"Base estimators such as {name} should not be included" f" in all_estimators" ) assert not name.lower().startswith("base"), msg def _tested_estimators(): for name, Estimator in all_estimators(): try: estimator = _construct_instance(Estimator) set_random_state(estimator) except SkipTest: continue if isinstance(estimator, NearMiss): # For NearMiss, let's check the three algorithms for version in (1, 2, 3): yield clone(estimator).set_params(version=version) else: yield estimator @parametrize_with_checks_sklearn(list(_tested_estimators())) def test_estimators_compatibility_sklearn(estimator, check, request): _set_checking_parameters(estimator) check(estimator) @parametrize_with_checks(list(_tested_estimators())) def test_estimators_imblearn(estimator, check, request): # Common tests for estimator instances with ignore_warnings(category=(FutureWarning, ConvergenceWarning, UserWarning, FutureWarning)): _set_checking_parameters(estimator) check(estimator) imbalanced-learn-0.7.0/imblearn/tests/test_exceptions.py000066400000000000000000000005671366766276300234570ustar00rootroot00000000000000"""Test for the exceptions modules""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from pytest import raises from imblearn.exceptions import raise_isinstance_error def test_raise_isinstance_error(): var = 10.0 with raises(ValueError, match="has to be one of"): raise_isinstance_error("var", [int], var) imbalanced-learn-0.7.0/imblearn/tests/test_pipeline.py000066400000000000000000001245761366766276300231120ustar00rootroot00000000000000""" Test the pipeline module. """ # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import itertools import re import shutil import time from tempfile import mkdtemp import numpy as np import pytest from pytest import raises from joblib import Memory from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.base import clone, BaseEstimator from sklearn.svm import SVC from sklearn.neighbors import LocalOutlierFactor from sklearn.decomposition import PCA from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LinearRegression from sklearn.cluster import KMeans from sklearn.feature_selection import SelectKBest, f_classif from sklearn.datasets import load_iris, make_classification from sklearn.preprocessing import StandardScaler from sklearn.pipeline import FeatureUnion from imblearn.datasets import make_imbalance from imblearn.pipeline import Pipeline, make_pipeline from imblearn.under_sampling import RandomUnderSampler from imblearn.under_sampling import EditedNearestNeighbours as ENN JUNK_FOOD_DOCS = ( "the pizza pizza beer copyright", "the pizza burger beer copyright", "the the pizza beer beer copyright", "the burger beer beer copyright", "the coke burger coke copyright", "the coke burger burger", ) R_TOL = 1e-4 class NoFit: """Small class to test parameter dispatching. 
""" def __init__(self, a=None, b=None): self.a = a self.b = b class NoTrans(NoFit): def fit(self, X, y): return self def get_params(self, deep=False): return {"a": self.a, "b": self.b} def set_params(self, **params): self.a = params["a"] return self class NoInvTransf(NoTrans): def transform(self, X, y=None): return X class Transf(NoInvTransf): def transform(self, X, y=None): return X def inverse_transform(self, X): return X class TransfFitParams(Transf): def fit(self, X, y, **fit_params): self.fit_params = fit_params return self class Mult(BaseEstimator): def __init__(self, mult=1): self.mult = mult def fit(self, X, y): return self def transform(self, X): return np.asarray(X) * self.mult def inverse_transform(self, X): return np.asarray(X) / self.mult def predict(self, X): return (np.asarray(X) * self.mult).sum(axis=1) predict_proba = predict_log_proba = decision_function = predict def score(self, X, y=None): return np.sum(X) class FitParamT(BaseEstimator): """Mock classifier """ def __init__(self): self.successful = False def fit(self, X, y, should_succeed=False): self.successful = should_succeed def predict(self, X): return self.successful def fit_predict(self, X, y, should_succeed=False): self.fit(X, y, should_succeed=should_succeed) return self.predict(X) def score(self, X, y=None, sample_weight=None): if sample_weight is not None: X = X * sample_weight return np.sum(X) class DummyTransf(Transf): """Transformer which store the column means""" def fit(self, X, y): self.means_ = np.mean(X, axis=0) # store timestamp to figure out whether the result of 'fit' has been # cached or not self.timestamp_ = time.time() return self class DummyEstimatorParams(BaseEstimator): """Mock classifier that takes params on predict""" def fit(self, X, y): return self def predict(self, X, got_attribute=False): self.got_attribute = got_attribute return self class DummySampler(NoTrans): """Samplers which returns a balanced number of samples""" def fit_resample(self, X, y): self.means_ = np.mean(X, axis=0) # store timestamp to figure out whether the result of 'fit' has been # cached or not self.timestamp_ = time.time() return X, y class FitTransformSample(NoTrans): """Estimator implementing both transform and sample """ def fit(self, X, y, should_succeed=False): pass def fit_resample(self, X, y=None): return X, y def fit_transform(self, X, y=None): return self.fit(X, y).transform(X) def transform(self, X, y=None): return X def test_pipeline_init_tuple(): # Pipeline accepts steps as tuple X = np.array([[1, 2]]) pipe = Pipeline((("transf", Transf()), ("clf", FitParamT()))) pipe.fit(X, y=None) pipe.score(X) pipe.set_params(transf="passthrough") pipe.fit(X, y=None) pipe.score(X) def test_pipeline_init(): # Test the various init parameters of the pipeline. 
with raises(TypeError): Pipeline() # Check that we can't instantiate pipelines with objects without fit # method error_regex = ( "Last step of Pipeline should implement fit or be the " "string 'passthrough'" ) with raises(TypeError, match=error_regex): Pipeline([("clf", NoFit())]) # Smoke test with only an estimator clf = NoTrans() pipe = Pipeline([("svc", clf)]) expected = dict( svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False) ) assert pipe.get_params(deep=True) == expected # Check that params are set pipe.set_params(svc__a=0.1) assert clf.a == 0.1 assert clf.b is None # Smoke test the repr: repr(pipe) # Test with two objects clf = SVC(gamma="scale") filter1 = SelectKBest(f_classif) pipe = Pipeline([("anova", filter1), ("svc", clf)]) # Check that we can't instantiate with non-transformers on the way # Note that NoTrans implements fit, but not transform error_regex = "implement fit and transform or fit_resample" with raises(TypeError, match=error_regex): Pipeline([("t", NoTrans()), ("svc", clf)]) # Check that params are set pipe.set_params(svc__C=0.1) assert clf.C == 0.1 # Smoke test the repr: repr(pipe) # Check that params are not set when naming them wrong with raises(ValueError): pipe.set_params(anova__C=0.1) # Test clone pipe2 = clone(pipe) assert not pipe.named_steps["svc"] is pipe2.named_steps["svc"] # Check that apart from estimators, the parameters are the same params = pipe.get_params(deep=True) params2 = pipe2.get_params(deep=True) for x in pipe.get_params(deep=False): params.pop(x) for x in pipe2.get_params(deep=False): params2.pop(x) # Remove estimators that where copied params.pop("svc") params.pop("anova") params2.pop("svc") params2.pop("anova") assert params == params2 def test_pipeline_methods_anova(): # Test the various methods of the pipeline (anova). iris = load_iris() X = iris.data y = iris.target # Test with Anova + LogisticRegression clf = LogisticRegression(solver="lbfgs", multi_class="auto") filter1 = SelectKBest(f_classif, k=2) pipe = Pipeline([("anova", filter1), ("logistic", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y) def test_pipeline_fit_params(): # Test that the pipeline can take fit parameters pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())]) pipe.fit(X=None, y=None, clf__should_succeed=True) # classifier should return True assert pipe.predict(None) # and transformer params should not be changed assert pipe.named_steps["transf"].a is None assert pipe.named_steps["transf"].b is None # invalid parameters should raise an error message with raises(TypeError, match="unexpected keyword argument"): pipe.fit(None, None, clf__bad=True) def test_pipeline_sample_weight_supported(): # Pipeline should pass sample_weight X = np.array([[1, 2]]) pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())]) pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, y=None) == 3 assert pipe.score(X, y=None, sample_weight=None) == 3 assert pipe.score(X, sample_weight=np.array([2, 3])) == 8 def test_pipeline_sample_weight_unsupported(): # When sample_weight is None it shouldn't be passed X = np.array([[1, 2]]) pipe = Pipeline([("transf", Transf()), ("clf", Mult())]) pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, sample_weight=None) == 3 with raises(TypeError, match="unexpected keyword argument"): pipe.score(X, sample_weight=np.array([2, 3])) def test_pipeline_raise_set_params_error(): # Test pipeline raises set params error message for nested models. 
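    # ``set_params`` routes nested parameters through the ``<step>__<param>``
    # syntax (e.g. ``pipe.set_params(cls__fit_intercept=False)`` for the
    # ``cls`` step below); unknown names at either level must raise a
    # ValueError instead of being silently ignored.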
pipe = Pipeline([("cls", LinearRegression())]) with raises(ValueError, match="Invalid parameter"): pipe.set_params(fake="nope") # nested model check with raises(ValueError, match="Invalid parameter"): pipe.set_params(fake__estimator="nope") def test_pipeline_methods_pca_svm(): # Test the various methods of the pipeline (pca + svm). iris = load_iris() X = iris.data y = iris.target # Test with PCA + SVC clf = SVC(gamma="scale", probability=True, random_state=0) pca = PCA(svd_solver="full", n_components="mle", whiten=True) pipe = Pipeline([("pca", pca), ("svc", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y) def test_pipeline_methods_preprocessing_svm(): # Test the various methods of the pipeline (preprocessing + svm). iris = load_iris() X = iris.data y = iris.target n_samples = X.shape[0] n_classes = len(np.unique(y)) scaler = StandardScaler() pca = PCA(n_components=2, svd_solver="randomized", whiten=True) clf = SVC( gamma="scale", probability=True, random_state=0, decision_function_shape="ovr", ) for preprocessing in [scaler, pca]: pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)]) pipe.fit(X, y) # check shapes of various prediction functions predict = pipe.predict(X) assert predict.shape == (n_samples,) proba = pipe.predict_proba(X) assert proba.shape == (n_samples, n_classes) log_proba = pipe.predict_log_proba(X) assert log_proba.shape == (n_samples, n_classes) decision_function = pipe.decision_function(X) assert decision_function.shape == (n_samples, n_classes) pipe.score(X, y) def test_fit_predict_on_pipeline(): # test that the fit_predict method is implemented on a pipeline # test that the fit_predict on pipeline yields same results as applying # transform and clustering steps separately iris = load_iris() scaler = StandardScaler() km = KMeans(random_state=0) # As pipeline doesn't clone estimators on construction, # it must have its own estimators scaler_for_pipeline = StandardScaler() km_for_pipeline = KMeans(random_state=0) # first compute the transform and clustering step separately scaled = scaler.fit_transform(iris.data) separate_pred = km.fit_predict(scaled) # use a pipeline to do the transform and clustering in one step pipe = Pipeline( [("scaler", scaler_for_pipeline), ("Kmeans", km_for_pipeline)] ) pipeline_pred = pipe.fit_predict(iris.data) assert_array_almost_equal(pipeline_pred, separate_pred) def test_fit_predict_on_pipeline_without_fit_predict(): # tests that a pipeline does not have fit_predict method when final # step of pipeline does not have fit_predict defined scaler = StandardScaler() pca = PCA(svd_solver="full") pipe = Pipeline([("scaler", scaler), ("pca", pca)]) error_regex = "'PCA' object has no attribute 'fit_predict'" with raises(AttributeError, match=error_regex): getattr(pipe, "fit_predict") def test_fit_predict_with_intermediate_fit_params(): # tests that Pipeline passes fit_params to intermediate steps # when fit_predict is invoked pipe = Pipeline([("transf", TransfFitParams()), ("clf", FitParamT())]) pipe.fit_predict( X=None, y=None, transf__should_get_this=True, clf__should_succeed=True ) assert pipe.named_steps["transf"].fit_params["should_get_this"] assert pipe.named_steps["clf"].successful assert "should_succeed" not in pipe.named_steps["transf"].fit_params def test_pipeline_transform(): # Test whether pipeline works with a transformer at the end. 
# Also test pipeline.transform and pipeline.inverse_transform iris = load_iris() X = iris.data pca = PCA(n_components=2, svd_solver="full") pipeline = Pipeline([("pca", pca)]) # test transform and fit_transform: X_trans = pipeline.fit(X).transform(X) X_trans2 = pipeline.fit_transform(X) X_trans3 = pca.fit_transform(X) assert_array_almost_equal(X_trans, X_trans2) assert_array_almost_equal(X_trans, X_trans3) X_back = pipeline.inverse_transform(X_trans) X_back2 = pca.inverse_transform(X_trans) assert_array_almost_equal(X_back, X_back2) def test_pipeline_fit_transform(): # Test whether pipeline works with a transformer missing fit_transform iris = load_iris() X = iris.data y = iris.target transf = Transf() pipeline = Pipeline([("mock", transf)]) # test fit_transform: X_trans = pipeline.fit_transform(X, y) X_trans2 = transf.fit(X, y).transform(X) assert_array_almost_equal(X_trans, X_trans2) def test_set_pipeline_steps(): transf1 = Transf() transf2 = Transf() pipeline = Pipeline([("mock", transf1)]) assert pipeline.named_steps["mock"] is transf1 # Directly setting attr pipeline.steps = [("mock2", transf2)] assert "mock" not in pipeline.named_steps assert pipeline.named_steps["mock2"] is transf2 assert [("mock2", transf2)] == pipeline.steps # Using set_params pipeline.set_params(steps=[("mock", transf1)]) assert [("mock", transf1)] == pipeline.steps # Using set_params to replace single step pipeline.set_params(mock=transf2) assert [("mock", transf2)] == pipeline.steps # With invalid data pipeline.set_params(steps=[("junk", ())]) with raises(TypeError): pipeline.fit([[1]], [1]) with raises(TypeError): pipeline.fit_transform([[1]], [1]) @pytest.mark.parametrize("passthrough", [None, "passthrough"]) def test_pipeline_correctly_adjusts_steps(passthrough): X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) pipeline = Pipeline( [("m2", mult2), ("bad", passthrough), ("m3", mult3), ("m5", mult5)] ) pipeline.fit(X, y) expected_names = ["m2", "bad", "m3", "m5"] actual_names = [name for name, _ in pipeline.steps] assert expected_names == actual_names @pytest.mark.parametrize("passthrough", [None, "passthrough"]) def test_set_pipeline_step_passthrough(passthrough): # Test setting Pipeline steps to None X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) def make(): return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)]) pipeline = make() exp = 2 * 3 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline.set_params(m3=passthrough) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) expected_params = { "steps": pipeline.steps, "m2": mult2, "m3": passthrough, "last": mult5, "memory": None, "m2__mult": 2, "last__mult": 5, "verbose": False, } assert pipeline.get_params(deep=True) == expected_params pipeline.set_params(m2=passthrough) exp = 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) # for other methods, ensure no AttributeErrors on None: other_methods = [ "predict_proba", "predict_log_proba", "decision_function", "transform", "score", ] for method in other_methods: getattr(pipeline, method)(X) pipeline.set_params(m2=mult2) 
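    # With ``m2`` restored and ``m3`` still set to passthrough, only the
    # first and last multipliers are active again.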
exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline = make() pipeline.set_params(last=passthrough) # mult2 and mult3 are active exp = 6 pipeline.fit(X, y) pipeline.transform(X) assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) with raises(AttributeError, match="has no attribute 'predict'"): getattr(pipeline, "predict") # Check 'passthrough' step at construction time exp = 2 * 5 pipeline = Pipeline([("m2", mult2), ("m3", passthrough), ("last", mult5)]) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) def test_pipeline_ducktyping(): pipeline = make_pipeline(Mult(5)) pipeline.predict pipeline.transform pipeline.inverse_transform pipeline = make_pipeline(Transf()) assert not hasattr(pipeline, "predict") pipeline.transform pipeline.inverse_transform pipeline = make_pipeline("passthrough") assert pipeline.steps[0] == ("passthrough", "passthrough") assert not hasattr(pipeline, "predict") pipeline.transform pipeline.inverse_transform pipeline = make_pipeline(Transf(), NoInvTransf()) assert not hasattr(pipeline, "predict") pipeline.transform assert not hasattr(pipeline, "inverse_transform") pipeline = make_pipeline(NoInvTransf(), Transf()) assert not hasattr(pipeline, "predict") pipeline.transform assert not hasattr(pipeline, "inverse_transform") def test_make_pipeline(): t1 = Transf() t2 = Transf() pipe = make_pipeline(t1, t2) assert isinstance(pipe, Pipeline) assert pipe.steps[0][0] == "transf-1" assert pipe.steps[1][0] == "transf-2" pipe = make_pipeline(t1, t2, FitParamT()) assert isinstance(pipe, Pipeline) assert pipe.steps[0][0] == "transf-1" assert pipe.steps[1][0] == "transf-2" assert pipe.steps[2][0] == "fitparamt" def test_classes_property(): iris = load_iris() X = iris.data y = iris.target reg = make_pipeline(SelectKBest(k=1), LinearRegression()) reg.fit(X, y) with raises(AttributeError): getattr(reg, "classes_") clf = make_pipeline( SelectKBest(k=1), LogisticRegression(solver="lbfgs", multi_class="auto", random_state=0), ) with raises(AttributeError): getattr(clf, "classes_") clf.fit(X, y) assert_array_equal(clf.classes_, np.unique(y)) def test_pipeline_wrong_memory(): # Test that an error is raised when memory is not a string or a Memory # instance iris = load_iris() X = iris.data y = iris.target # Define memory as an integer memory = 1 cached_pipe = Pipeline( [("transf", DummyTransf()), ("svc", SVC(gamma="scale"))], memory=memory ) error_regex = "string or have the same interface as" with raises(ValueError, match=error_regex): cached_pipe.fit(X, y) def test_pipeline_memory_transformer(): iris = load_iris() X = iris.data y = iris.target cachedir = mkdtemp() try: memory = Memory(cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(gamma="scale", probability=True, random_state=0) transf = DummyTransf() pipe = Pipeline([("transf", clone(transf)), ("svc", clf)]) cached_pipe = Pipeline( [("transf", transf), ("svc", clf)], memory=memory ) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the tranformer in the cached pipeline expected_ts = cached_pipe.named_steps["transf"].timestamp_ # Check that cached_pipe and pipe yield identical results 
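        # The cached pipeline memoizes the transformer's fit, keyed on its
        # parameters and the training data, so a second identical fit must
        # reuse the stored result (hence the ``timestamp_`` comparison
        # further down) while still matching the uncached pipeline's output.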
assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal( pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X) ) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal( pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X) ) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(gamma="scale", probability=True, random_state=0) transf_2 = DummyTransf() cached_pipe_2 = Pipeline( [("transf_2", transf_2), ("svc", clf_2)], memory=memory ) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal( pipe.predict_proba(X), cached_pipe_2.predict_proba(X) ) assert_array_equal( pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X) ) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe_2.named_steps["transf_2"].means_, ) assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts finally: shutil.rmtree(cachedir) def test_pipeline_memory_sampler(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) cachedir = mkdtemp() try: memory = Memory(cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(gamma="scale", probability=True, random_state=0) transf = DummySampler() pipe = Pipeline([("transf", clone(transf)), ("svc", clf)]) cached_pipe = Pipeline( [("transf", transf), ("svc", clf)], memory=memory ) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the tranformer in the cached pipeline expected_ts = cached_pipe.named_steps["transf"].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal( pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X) ) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal( pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X) ) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, 
cached_pipe.named_steps["transf"].means_, ) assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(gamma="scale", probability=True, random_state=0) transf_2 = DummySampler() cached_pipe_2 = Pipeline( [("transf_2", transf_2), ("svc", clf_2)], memory=memory ) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal( pipe.predict_proba(X), cached_pipe_2.predict_proba(X) ) assert_array_equal( pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X) ) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe_2.named_steps["transf_2"].means_, ) assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts finally: shutil.rmtree(cachedir) def test_pipeline_methods_pca_rus_svm(): # Test the various methods of the pipeline (pca + svm). X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) # Test with PCA + SVC clf = SVC(gamma="scale", probability=True, random_state=0) pca = PCA() rus = RandomUnderSampler(random_state=0) pipe = Pipeline([("pca", pca), ("rus", rus), ("svc", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y) def test_pipeline_methods_rus_pca_svm(): # Test the various methods of the pipeline (pca + svm). X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) # Test with PCA + SVC clf = SVC(gamma="scale", probability=True, random_state=0) pca = PCA() rus = RandomUnderSampler(random_state=0) pipe = Pipeline([("rus", rus), ("pca", pca), ("svc", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y) def test_pipeline_sample(): # Test whether pipeline works with a sampler at the end. # Also test pipeline.sampler X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) rus = RandomUnderSampler(random_state=0) pipeline = Pipeline([("rus", rus)]) # test transform and fit_transform: X_trans, y_trans = pipeline.fit_resample(X, y) X_trans2, y_trans2 = rus.fit_resample(X, y) assert_allclose(X_trans, X_trans2, rtol=R_TOL) assert_allclose(y_trans, y_trans2, rtol=R_TOL) pca = PCA() pipeline = Pipeline([("pca", PCA()), ("rus", rus)]) X_trans, y_trans = pipeline.fit_resample(X, y) X_pca = pca.fit_transform(X) X_trans2, y_trans2 = rus.fit_resample(X_pca, y) # We round the value near to zero. It seems that PCA has some issue # with that X_trans[np.bitwise_and(X_trans < R_TOL, X_trans > -R_TOL)] = 0 X_trans2[np.bitwise_and(X_trans2 < R_TOL, X_trans2 > -R_TOL)] = 0 assert_allclose(X_trans, X_trans2, rtol=R_TOL) assert_allclose(y_trans, y_trans2, rtol=R_TOL) def test_pipeline_sample_transform(): # Test whether pipeline works with a sampler at the end. 
# Also test pipeline.sampler X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) rus = RandomUnderSampler(random_state=0) pca = PCA() pca2 = PCA() pipeline = Pipeline([("pca", pca), ("rus", rus), ("pca2", pca2)]) pipeline.fit(X, y).transform(X) def test_pipeline_none_classifier(): # Test pipeline using None as preprocessing step and a classifier X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) clf = LogisticRegression(solver="lbfgs", random_state=0) pipe = make_pipeline(None, clf) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.decision_function(X) pipe.score(X, y) def test_pipeline_none_sampler_classifier(): # Test pipeline using None, RUS and a classifier X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) clf = LogisticRegression(solver="lbfgs", random_state=0) rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(None, rus, clf) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.decision_function(X) pipe.score(X, y) def test_pipeline_sampler_none_classifier(): # Test pipeline using RUS, None and a classifier X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) clf = LogisticRegression(solver="lbfgs", random_state=0) rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(rus, None, clf) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.decision_function(X) pipe.score(X, y) def test_pipeline_none_sampler_sample(): # Test pipeline using None step and a sampler X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(None, rus) pipe.fit_resample(X, y) def test_pipeline_none_transformer(): # Test pipeline using None and a transformer that implements transform and # inverse_transform X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) pca = PCA(whiten=True) pipe = make_pipeline(None, pca) pipe.fit(X, y) X_trans = pipe.transform(X) X_inversed = pipe.inverse_transform(X_trans) assert_array_almost_equal(X, X_inversed) def test_pipeline_methods_anova_rus(): # Test the various methods of the pipeline (anova). X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) # Test with RandomUnderSampling + Anova + LogisticRegression clf = LogisticRegression(solver="lbfgs") rus = RandomUnderSampler(random_state=0) filter1 = SelectKBest(f_classif, k=2) pipe = Pipeline([("rus", rus), ("anova", filter1), ("logistic", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y) def test_pipeline_with_step_that_implements_both_sample_and_transform(): # Test the various methods of the pipeline (anova). 
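    # A step implementing both ``fit_resample`` and ``fit``/``transform`` is
    # ambiguous, so constructing the pipeline below must raise a TypeError.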
X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) clf = LogisticRegression(solver="lbfgs") with raises(TypeError): Pipeline([("step", FitTransformSample()), ("logistic", clf)]) def test_pipeline_with_step_that_it_is_pipeline(): # Test the various methods of the pipeline (anova). X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) # Test with RandomUnderSampling + Anova + LogisticRegression clf = LogisticRegression(solver="lbfgs") rus = RandomUnderSampler(random_state=0) filter1 = SelectKBest(f_classif, k=2) pipe1 = Pipeline([("rus", rus), ("anova", filter1)]) with raises(TypeError): Pipeline([("pipe1", pipe1), ("logistic", clf)]) def test_pipeline_fit_then_sample_with_sampler_last_estimator(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=50000, random_state=0, ) rus = RandomUnderSampler(random_state=42) enn = ENN() pipeline = make_pipeline(rus, enn) X_fit_resample_resampled, y_fit_resample_resampled = pipeline.fit_resample( X, y ) pipeline = make_pipeline(rus, enn) pipeline.fit(X, y) X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) assert_array_equal(X_fit_resample_resampled, X_fit_then_sample_res) assert_array_equal(y_fit_resample_resampled, y_fit_then_sample_res) def test_pipeline_fit_then_sample_3_samplers_with_sampler_last_estimator(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=50000, random_state=0, ) rus = RandomUnderSampler(random_state=42) enn = ENN() pipeline = make_pipeline(rus, enn, rus) X_fit_resample_resampled, y_fit_resample_resampled = pipeline.fit_resample( X, y ) pipeline = make_pipeline(rus, enn, rus) pipeline.fit(X, y) X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) assert_array_equal(X_fit_resample_resampled, X_fit_then_sample_res) assert_array_equal(y_fit_resample_resampled, y_fit_then_sample_res) def test_make_pipeline_memory(): cachedir = mkdtemp() try: memory = Memory(cachedir, verbose=10) pipeline = make_pipeline( DummyTransf(), SVC(gamma="scale"), memory=memory ) assert pipeline.memory is memory pipeline = make_pipeline(DummyTransf(), SVC(gamma="scale")) assert pipeline.memory is None finally: shutil.rmtree(cachedir) def test_predict_with_predict_params(): # tests that Pipeline passes predict_params to the final estimator # when predict is invoked pipe = Pipeline([("transf", Transf()), ("clf", DummyEstimatorParams())]) pipe.fit(None, None) pipe.predict(X=None, got_attribute=True) assert pipe.named_steps["clf"].got_attribute def test_resampler_last_stage_passthrough(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=50000, random_state=0, ) rus = RandomUnderSampler(random_state=42) pipe = make_pipeline(rus, None) pipe.fit_resample(X, y) def test_pipeline_score_samples_pca_lof(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.3, 0.7], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0, ) # Test that the 
score_samples method is implemented on a pipeline. # Test that the score_samples method on pipeline yields same results as # applying transform and score_samples steps separately. rus = RandomUnderSampler(random_state=42) pca = PCA(svd_solver="full", n_components="mle", whiten=True) lof = LocalOutlierFactor(novelty=True) pipe = Pipeline([("rus", rus), ("pca", pca), ("lof", lof)]) pipe.fit(X, y) # Check the shapes assert pipe.score_samples(X).shape == (X.shape[0],) # Check the values X_res, _ = rus.fit_resample(X, y) lof.fit(pca.fit_transform(X_res)) assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X))) def test_score_samples_on_pipeline_without_score_samples(): X = np.array([[1], [2]]) y = np.array([1, 2]) # Test that a pipeline does not have score_samples method when the final # step of the pipeline does not have score_samples defined. pipe = make_pipeline(LogisticRegression()) pipe.fit(X, y) with pytest.raises( AttributeError, match="'LogisticRegression' object has no attribute " "'score_samples'", ): pipe.score_samples(X) def test_pipeline_param_error(): clf = make_pipeline(LogisticRegression()) with pytest.raises(ValueError, match="Pipeline.fit does not accept " "the sample_weight parameter"): clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1]) parameter_grid_test_verbose = ((est, pattern, method) for (est, pattern), method in itertools.product( [ (Pipeline([('transf', Transf()), ('clf', FitParamT())]), r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n' r'\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$'), (Pipeline([('transf', Transf()), ('noop', None), ('clf', FitParamT())]), r'\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n' r'\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n' r'\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$'), (Pipeline([('transf', Transf()), ('noop', 'passthrough'), ('clf', FitParamT())]), r'\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n' r'\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n' r'\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$'), (Pipeline([('transf', Transf()), ('clf', None)]), r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n' r'\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$'), (Pipeline([('transf', None), ('mult', Mult())]), r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n' r'\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$'), (Pipeline([('transf', 'passthrough'), ('mult', Mult())]), r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n' r'\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$'), (FeatureUnion([('mult1', Mult()), ('mult2', Mult())]), r'\[FeatureUnion\].*\(step 1 of 2\) Processing mult1.* total=.*\n' r'\[FeatureUnion\].*\(step 2 of 2\) Processing mult2.* total=.*\n$'), (FeatureUnion([('mult1', 'drop'), ('mult2', Mult()), ('mult3', 'drop')]), r'\[FeatureUnion\].*\(step 1 of 1\) Processing mult2.* total=.*\n$') ], ['fit', 'fit_transform', 'fit_predict']) if hasattr(est, method) and not ( method == 'fit_transform' and hasattr(est, 'steps') and isinstance(est.steps[-1][1], FitParamT)) ) @pytest.mark.parametrize('est, pattern, method', parameter_grid_test_verbose) def test_verbose(est, method, pattern, capsys): func = getattr(est, method) X = [[1, 2, 3], [4, 5, 6]] y = [[7], [8]] est.set_params(verbose=False) func(X, y) assert not capsys.readouterr().out, 'Got output for verbose=False' est.set_params(verbose=True) func(X, y) assert re.match(pattern, 
capsys.readouterr().out) def test_pipeline_score_samples_pca_lof(): X, y = load_iris(return_X_y=True) sampling_strategy = {0: 50, 1: 30, 2: 20} X, y = make_imbalance(X, y, sampling_strategy=sampling_strategy) # Test that the score_samples method is implemented on a pipeline. # Test that the score_samples method on pipeline yields same results as # applying transform and score_samples steps separately. rus = RandomUnderSampler() pca = PCA(svd_solver='full', n_components='mle', whiten=True) lof = LocalOutlierFactor(novelty=True) pipe = Pipeline([('rus', rus), ('pca', pca), ('lof', lof)]) pipe.fit(X, y) # Check the shapes assert pipe.score_samples(X).shape == (X.shape[0],) # Check the values lof.fit(pca.fit_transform(X)) assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X))) def test_pipeline_old_joblib_memory(monkeypatch): """Test that Pipeline works with old versions of joblib""" monkeypatch.setattr(Memory, "cachedir", "foo", raising=False) monkeypatch.setattr(Memory, "cache", lambda self, x: x, raising=False) memory = Memory() del memory.location # Older versions do not have the location parameter iris = load_iris() X = iris.data y = iris.target cached_pipe = Pipeline( [("transf", DummyTransf()), ("svc", SVC(gamma="scale"))], memory=memory ) cached_pipe.fit(X, y) imbalanced-learn-0.7.0/imblearn/under_sampling/000077500000000000000000000000001366766276300215225ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/__init__.py000066400000000000000000000017321366766276300236360ustar00rootroot00000000000000""" The :mod:`imblearn.under_sampling` provides methods to under-sample a dataset. """ from ._prototype_generation import ClusterCentroids from ._prototype_selection import RandomUnderSampler from ._prototype_selection import TomekLinks from ._prototype_selection import NearMiss from ._prototype_selection import CondensedNearestNeighbour from ._prototype_selection import OneSidedSelection from ._prototype_selection import NeighbourhoodCleaningRule from ._prototype_selection import EditedNearestNeighbours from ._prototype_selection import RepeatedEditedNearestNeighbours from ._prototype_selection import AllKNN from ._prototype_selection import InstanceHardnessThreshold __all__ = [ "ClusterCentroids", "RandomUnderSampler", "InstanceHardnessThreshold", "NearMiss", "TomekLinks", "EditedNearestNeighbours", "RepeatedEditedNearestNeighbours", "AllKNN", "OneSidedSelection", "CondensedNearestNeighbour", "NeighbourhoodCleaningRule", ] imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_generation/000077500000000000000000000000001366766276300261415ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_generation/__init__.py000066400000000000000000000003501366766276300302500ustar00rootroot00000000000000""" The :mod:`imblearn.under_sampling.prototype_generation` submodule contains methods that generate new samples in order to balance the dataset. 
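Currently it only exposes :class:`ClusterCentroids`.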
""" from ._cluster_centroids import ClusterCentroids __all__ = ["ClusterCentroids"] imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py000066400000000000000000000147311366766276300324130ustar00rootroot00000000000000"""Class to perform under-sampling by generating centroids based on clustering.""" # Authors: Guillaume Lemaitre # Fernando Nogueira # Christos Aridas # License: MIT import warnings import numpy as np from scipy import sparse from sklearn.base import clone from sklearn.cluster import KMeans from sklearn.neighbors import NearestNeighbors from sklearn.utils import _safe_indexing from ..base import BaseUnderSampler from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring from ...utils._docstring import _random_state_docstring from ...utils._validation import _deprecate_positional_args VOTING_KIND = ("auto", "hard", "soft") @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class ClusterCentroids(BaseUnderSampler): """Undersample by generating centroids based on clustering methods. Method that under samples the majority class by replacing a cluster of majority samples by the cluster centroid of a KMeans algorithm. This algorithm keeps N majority samples by fitting the KMeans algorithm with N cluster to the majority class and using the coordinates of the N cluster centroids as the new majority samples. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} estimator : object, default=KMeans() Pass a :class:`sklearn.cluster.KMeans` estimator. voting : {{"hard", "soft", "auto"}}, default='auto' Voting strategy to generate the new samples: - If ``'hard'``, the nearest-neighbors of the centroids found using the clustering algorithm will be used. - If ``'soft'``, the centroids found by the clustering algorithm will be used. - If ``'auto'``, if the input is sparse, it will default on ``'hard'`` otherwise, ``'soft'`` will be used. .. versionadded:: 0.3.0 {n_jobs} .. deprecated:: 0.7 `n_jobs` was deprecated in 0.7 and will be removed in 0.9. See Also -------- EditedNearestNeighbours : Under-sampling by editing samples. CondensedNearestNeighbour: Under-sampling by condensing samples. Notes ----- Supports multi-class resampling by sampling each class independently. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ ClusterCentroids # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> cc = ClusterCentroids(random_state=42) >>> X_res, y_res = cc.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) ... 
# doctest: +ELLIPSIS Resampled dataset shape Counter({{...}}) """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", random_state=None, estimator=None, voting="auto", n_jobs="deprecated", ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.estimator = estimator self.voting = voting self.n_jobs = n_jobs def _validate_estimator(self): """Private function to create the KMeans estimator""" if self.n_jobs != "deprecated": warnings.warn( "'n_jobs' was deprecated in 0.7 and will be removed in 0.9", FutureWarning ) if self.estimator is None: self.estimator_ = KMeans(random_state=self.random_state) elif isinstance(self.estimator, KMeans): self.estimator_ = clone(self.estimator) else: raise ValueError( "`estimator` has to be a KMeans clustering." " Got {} instead.".format(type(self.estimator)) ) def _generate_sample(self, X, y, centroids, target_class): if self.voting_ == "hard": nearest_neighbors = NearestNeighbors(n_neighbors=1) nearest_neighbors.fit(X, y) indices = nearest_neighbors.kneighbors( centroids, return_distance=False ) X_new = _safe_indexing(X, np.squeeze(indices)) else: if sparse.issparse(X): X_new = sparse.csr_matrix(centroids, dtype=X.dtype) else: X_new = centroids y_new = np.array([target_class] * centroids.shape[0], dtype=y.dtype) return X_new, y_new def _fit_resample(self, X, y): self._validate_estimator() if self.voting == "auto": if sparse.issparse(X): self.voting_ = "hard" else: self.voting_ = "soft" else: if self.voting in VOTING_KIND: self.voting_ = self.voting else: raise ValueError( "'voting' needs to be one of {}. Got {}" " instead.".format(VOTING_KIND, self.voting) ) X_resampled, y_resampled = [], [] for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] self.estimator_.set_params(**{"n_clusters": n_samples}) self.estimator_.fit(X[y == target_class]) X_new, y_new = self._generate_sample( X, y, self.estimator_.cluster_centers_, target_class ) X_resampled.append(X_new) y_resampled.append(y_new) else: target_class_indices = np.flatnonzero(y == target_class) X_resampled.append(_safe_indexing(X, target_class_indices)) y_resampled.append(_safe_indexing(y, target_class_indices)) if sparse.issparse(X): X_resampled = sparse.vstack(X_resampled) else: X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) return X_resampled, np.array(y_resampled, dtype=y.dtype) def _more_tags(self): return {"sample_indices": False} imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_generation/tests/000077500000000000000000000000001366766276300273035ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_generation/tests/__init__.py000066400000000000000000000000001366766276300314020ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py000066400000000000000000000065031366766276300344530ustar00rootroot00000000000000"""Test the module cluster centroids.""" from collections import Counter import pytest import numpy as np from scipy import sparse from sklearn.cluster import KMeans from imblearn.under_sampling import ClusterCentroids RND_SEED = 0 X = np.array( [ [0.04352327, -0.20515826], [0.92923648, 0.76103773], [0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982], ] ) Y = 
np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) R_TOL = 1e-4 @pytest.mark.parametrize( "X, expected_voting", [(X, "soft"), (sparse.csr_matrix(X), "hard")] ) def test_fit_resample_check_voting(X, expected_voting): cc = ClusterCentroids(random_state=RND_SEED) cc.fit_resample(X, Y) assert cc.voting_ == expected_voting def test_fit_resample_auto(): sampling_strategy = "auto" cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED ) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (6, 2) assert y_resampled.shape == (6,) def test_fit_resample_half(): sampling_strategy = {0: 3, 1: 6} cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED ) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (9, 2) assert y_resampled.shape == (9,) def test_multiclass_fit_resample(): y = Y.copy() y[5] = 2 y[6] = 2 cc = ClusterCentroids(random_state=RND_SEED) _, y_resampled = cc.fit_resample(X, y) count_y_res = Counter(y_resampled) assert count_y_res[0] == 2 assert count_y_res[1] == 2 assert count_y_res[2] == 2 def test_fit_resample_object(): sampling_strategy = "auto" cluster = KMeans(random_state=RND_SEED) cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED, estimator=cluster, ) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (6, 2) assert y_resampled.shape == (6,) def test_fit_hard_voting(): sampling_strategy = "auto" voting = "hard" cluster = KMeans(random_state=RND_SEED) cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED, estimator=cluster, voting=voting, ) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (6, 2) assert y_resampled.shape == (6,) for x in X_resampled: assert np.any(np.all(x == X, axis=1)) @pytest.mark.parametrize( "cluster_centroids_params, err_msg", [ ({"estimator": "rnd"}, "has to be a KMeans clustering"), ({"voting": "unknown"}, "needs to be one of"), ], ) def test_fit_resample_error(cluster_centroids_params, err_msg): cc = ClusterCentroids(**cluster_centroids_params) with pytest.raises(ValueError, match=err_msg): cc.fit_resample(X, Y) def test_cluster_centroids_n_jobs(): # check that we deprecate the `n_jobs` parameter. cc = ClusterCentroids(n_jobs=1) with pytest.warns(FutureWarning) as record: cc.fit_resample(X, Y) assert len(record) == 1 assert "'n_jobs' was deprecated" in record[0].message.args[0] imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/000077500000000000000000000000001366766276300257735ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/__init__.py000066400000000000000000000017351366766276300301120ustar00rootroot00000000000000""" The :mod:`imblearn.under_sampling.prototype_selection` submodule contains methods that select samples in order to balance the dataset. 
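It gathers, among others, random under-sampling, NearMiss, Tomek links and
the edited nearest-neighbour family of methods.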
""" from ._random_under_sampler import RandomUnderSampler from ._tomek_links import TomekLinks from ._nearmiss import NearMiss from ._condensed_nearest_neighbour import CondensedNearestNeighbour from ._one_sided_selection import OneSidedSelection from ._neighbourhood_cleaning_rule import NeighbourhoodCleaningRule from ._edited_nearest_neighbours import EditedNearestNeighbours from ._edited_nearest_neighbours import RepeatedEditedNearestNeighbours from ._edited_nearest_neighbours import AllKNN from ._instance_hardness_threshold import InstanceHardnessThreshold __all__ = [ "RandomUnderSampler", "InstanceHardnessThreshold", "NearMiss", "TomekLinks", "EditedNearestNeighbours", "RepeatedEditedNearestNeighbours", "AllKNN", "OneSidedSelection", "CondensedNearestNeighbour", "NeighbourhoodCleaningRule", ] imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py000066400000000000000000000172711366766276300342410ustar00rootroot00000000000000"""Class to perform under-sampling based on the condensed nearest neighbour method.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter import numpy as np from scipy.sparse import issparse from sklearn.base import clone from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import check_random_state, _safe_indexing from ..base import BaseCleaningSampler from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring from ...utils._docstring import _random_state_docstring from ...utils._validation import _deprecate_positional_args @Substitution( sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class CondensedNearestNeighbour(BaseCleaningSampler): """Undersample based on the condensed nearest neighbour method. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} n_neighbors : int or object, default=\ KNeighborsClassifier(n_neighbors=1) If ``int``, size of the neighbourhood to consider to compute the nearest neighbors. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the nearest-neighbors. n_seeds_S : int, default=1 Number of samples to extract in order to build the set S. {n_jobs} Attributes ---------- sample_indices_ : ndarray of shape (n_new_samples) Indices of the samples selected. .. versionadded:: 0.4 See Also -------- EditedNearestNeighbours : Undersample by editing samples. RepeatedEditedNearestNeighbours : Undersample by repeating ENN algorithm. AllKNN : Undersample using ENN and various number of neighbours. Notes ----- The method is based on [1]_. Supports multi-class resampling. A one-vs.-rest scheme is used when sampling a class as proposed in [1]_. References ---------- .. [1] P. Hart, "The condensed nearest neighbor rule," In Information Theory, IEEE Transactions on, vol. 14(3), pp. 515-516, 1968. 
Examples -------- >>> from collections import Counter # doctest: +SKIP >>> from sklearn.datasets import fetch_mldata # doctest: +SKIP >>> from imblearn.under_sampling import \ CondensedNearestNeighbour # doctest: +SKIP >>> pima = fetch_mldata('diabetes_scale') # doctest: +SKIP >>> X, y = pima['data'], pima['target'] # doctest: +SKIP >>> print('Original dataset shape %s' % Counter(y)) # doctest: +SKIP Original dataset shape Counter({{1: 500, -1: 268}}) # doctest: +SKIP >>> cnn = CondensedNearestNeighbour(random_state=42) # doctest: +SKIP >>> X_res, y_res = cnn.fit_resample(X, y) #doctest: +SKIP >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: +SKIP Resampled dataset shape Counter({{-1: 268, 1: 227}}) # doctest: +SKIP """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", random_state=None, n_neighbors=None, n_seeds_S=1, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.n_neighbors = n_neighbors self.n_seeds_S = n_seeds_S self.n_jobs = n_jobs def _validate_estimator(self): """Private function to create the NN estimator""" if self.n_neighbors is None: self.estimator_ = KNeighborsClassifier( n_neighbors=1, n_jobs=self.n_jobs ) elif isinstance(self.n_neighbors, int): self.estimator_ = KNeighborsClassifier( n_neighbors=self.n_neighbors, n_jobs=self.n_jobs ) elif isinstance(self.n_neighbors, KNeighborsClassifier): self.estimator_ = clone(self.n_neighbors) else: raise ValueError( "`n_neighbors` has to be a int or an object" " inhereited from KNeighborsClassifier." " Got {} instead.".format(type(self.n_neighbors)) ) def _fit_resample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) idx_under = np.empty((0,), dtype=int) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): # Randomly get one sample from the majority class # Generate the index to select idx_maj = np.flatnonzero(y == target_class) idx_maj_sample = idx_maj[ random_state.randint( low=0, high=target_stats[target_class], size=self.n_seeds_S, ) ] # Create the set C - One majority samples and all minority C_indices = np.append( np.flatnonzero(y == class_minority), idx_maj_sample ) C_x = _safe_indexing(X, C_indices) C_y = _safe_indexing(y, C_indices) # Create the set S - all majority samples S_indices = np.flatnonzero(y == target_class) S_x = _safe_indexing(X, S_indices) S_y = _safe_indexing(y, S_indices) # fit knn on C self.estimator_.fit(C_x, C_y) good_classif_label = idx_maj_sample.copy() # Check each sample in S if we keep it or drop it for idx_sam, (x_sam, y_sam) in enumerate(zip(S_x, S_y)): # Do not select sample which are already well classified if idx_sam in good_classif_label: continue # Classify on S if not issparse(x_sam): x_sam = x_sam.reshape(1, -1) pred_y = self.estimator_.predict(x_sam) # If the prediction do not agree with the true label # append it in C_x if y_sam != pred_y: # Keep the index for later idx_maj_sample = np.append( idx_maj_sample, idx_maj[idx_sam] ) # Update C C_indices = np.append(C_indices, idx_maj[idx_sam]) C_x = _safe_indexing(X, C_indices) C_y = _safe_indexing(y, C_indices) # fit a knn on C self.estimator_.fit(C_x, C_y) # This experimental to speed up the search # Classify all the element in S and avoid to test the # well classified elements pred_S_y = self.estimator_.predict(S_x) good_classif_label = np.unique( np.append( idx_maj_sample, 
                                np.flatnonzero(pred_S_y == S_y)
                            )
                        )

                idx_under = np.concatenate(
                    (idx_under, idx_maj_sample), axis=0
                )
            else:
                idx_under = np.concatenate(
                    (idx_under, np.flatnonzero(y == target_class)), axis=0
                )

        self.sample_indices_ = idx_under

        return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)

    def _more_tags(self):
        return {"sample_indices": True}

imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py
"""Class to perform under-sampling based on the edited nearest neighbour
method."""

# Authors: Guillaume Lemaitre
#          Dayvid Oliveira
#          Christos Aridas
# License: MIT

from collections import Counter

import numpy as np
from scipy.stats import mode

from sklearn.utils import _safe_indexing

from ..base import BaseCleaningSampler
from ...utils import check_neighbors_object
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._validation import _deprecate_positional_args

SEL_KIND = ("all", "mode")


@Substitution(
    sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
)
class EditedNearestNeighbours(BaseCleaningSampler):
    """Undersample based on the edited nearest neighbour method.

    This method will clean the database by removing samples close to the
    decision boundary.

    Read more in the :ref:`User Guide `.

    Parameters
    ----------
    {sampling_strategy}

    n_neighbors : int or object, default=3
        If ``int``, size of the neighbourhood to consider to compute the
        nearest neighbors. If object, an estimator that inherits from
        :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the nearest-neighbors.

    kind_sel : {{'all', 'mode'}}, default='all'
        Strategy to use in order to exclude samples.

        - If ``'all'``, all neighbours will have to agree with the samples of
          interest to not be excluded.
        - If ``'mode'``, the majority vote of the neighbours will be used in
          order to exclude a sample.

    {n_jobs}

    Attributes
    ----------
    sample_indices_ : ndarray of shape (n_new_samples)
        Indices of the samples selected.

        .. versionadded:: 0.4

    See Also
    --------
    CondensedNearestNeighbour : Undersample by condensing samples.

    RepeatedEditedNearestNeighbours : Undersample by repeating ENN algorithm.

    AllKNN : Undersample using ENN and various number of neighbours.

    Notes
    -----
    The method is based on [1]_.

    Supports multi-class resampling. A one-vs.-rest scheme is used when
    sampling a class as proposed in [1]_.

    References
    ----------
    .. [1] D. Wilson, "Asymptotic Properties of Nearest Neighbor Rules Using
       Edited Data," In IEEE Transactions on Systems, Man, and Cybernetics,
       vol. 2 (3), pp. 408-421, 1972.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
EditedNearestNeighbours # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ...
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> enn = EditedNearestNeighbours() >>> X_res, y_res = enn.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 887, 0: 100}}) """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", n_neighbors=3, kind_sel="all", n_jobs=None ): super().__init__(sampling_strategy=sampling_strategy) self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.n_jobs = n_jobs def _validate_estimator(self): """Validate the estimator created in the ENN.""" self.nn_ = check_neighbors_object( "n_neighbors", self.n_neighbors, additional_neighbor=1 ) self.nn_.set_params(**{"n_jobs": self.n_jobs}) if self.kind_sel not in SEL_KIND: raise NotImplementedError def _fit_resample(self, X, y): self._validate_estimator() idx_under = np.empty((0,), dtype=int) self.nn_.fit(X) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): target_class_indices = np.flatnonzero(y == target_class) X_class = _safe_indexing(X, target_class_indices) y_class = _safe_indexing(y, target_class_indices) nnhood_idx = self.nn_.kneighbors( X_class, return_distance=False )[:, 1:] nnhood_label = y[nnhood_idx] if self.kind_sel == "mode": nnhood_label, _ = mode(nnhood_label, axis=1) nnhood_bool = np.ravel(nnhood_label) == y_class elif self.kind_sel == "all": nnhood_label = nnhood_label == target_class nnhood_bool = np.all(nnhood_label, axis=1) index_target_class = np.flatnonzero(nnhood_bool) else: index_target_class = slice(None) idx_under = np.concatenate( ( idx_under, np.flatnonzero(y == target_class)[index_target_class], ), axis=0, ) self.sample_indices_ = idx_under return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) def _more_tags(self): return {"sample_indices": True} @Substitution( sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, ) class RepeatedEditedNearestNeighbours(BaseCleaningSampler): """Undersample based on the repeated edited nearest neighbour method. This method will repeat several time the ENN algorithm. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} n_neighbors : int or object, default=3 If ``int``, size of the neighbourhood to consider to compute the nearest neighbors. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the nearest-neighbors. max_iter : int, default=100 Maximum number of iterations of the edited nearest neighbours algorithm for a single run. kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. - If ``'all'``, all neighbours will have to agree with the samples of interest to not be excluded. - If ``'mode'``, the majority vote of the neighbours will be used in order to exclude a sample. {n_jobs} Attributes ---------- sample_indices_ : ndarray of shape (n_new_samples) Indices of the samples selected. .. versionadded:: 0.4 n_iter_ : int Number of iterations run. .. versionadded:: 0.6 See Also -------- CondensedNearestNeighbour : Undersample by condensing samples. EditedNearestNeighbours : Undersample by editing samples. AllKNN : Undersample using ENN and various number of neighbours. Notes ----- The method is based on [1]_. A one-vs.-rest scheme is used when sampling a class as proposed in [1]_. Supports multi-class resampling. 
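# ------------------------------------------------------------------
# Illustrative sketch (not part of the library): the iteration scheme of
# RepeatedEditedNearestNeighbours boils down to re-running ENN until the
# sample set stops shrinking. A minimal version, assuming numpy inputs and
# ignoring the extra class-preservation checks implemented below; the
# helper name `repeated_enn` is an assumption for this example only.
from imblearn.under_sampling import EditedNearestNeighbours

def repeated_enn(X, y, max_iter=100):
    enn = EditedNearestNeighbours()
    for _ in range(max_iter):
        n_before = y.shape[0]
        X, y = enn.fit_resample(X, y)
        if y.shape[0] == n_before:  # converged: ENN removed nothing
            break
    return X, y
# ------------------------------------------------------------------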
    References
    ----------
    .. [1] I. Tomek, "An Experiment with the Edited Nearest-Neighbor Rule,"
       IEEE Transactions on Systems, Man, and Cybernetics, vol. 6(6),
       pp. 448-452, June 1976.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
RepeatedEditedNearestNeighbours # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> renn = RepeatedEditedNearestNeighbours()
    >>> X_res, y_res = renn.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{1: 887, 0: 100}})
    """

    @_deprecate_positional_args
    def __init__(
        self,
        *,
        sampling_strategy="auto",
        n_neighbors=3,
        max_iter=100,
        kind_sel="all",
        n_jobs=None,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.n_neighbors = n_neighbors
        self.kind_sel = kind_sel
        self.n_jobs = n_jobs
        self.max_iter = max_iter

    def _validate_estimator(self):
        """Private function to create the NN estimator"""
        if self.max_iter < 2:
            raise ValueError(
                "max_iter must be greater than 1."
                " Got {} instead.".format(self.max_iter)
            )

        self.nn_ = check_neighbors_object(
            "n_neighbors", self.n_neighbors, additional_neighbor=1
        )

        self.enn_ = EditedNearestNeighbours(
            sampling_strategy=self.sampling_strategy,
            n_neighbors=self.nn_,
            kind_sel=self.kind_sel,
            n_jobs=self.n_jobs,
        )

    def _fit_resample(self, X, y):
        self._validate_estimator()

        X_, y_ = X, y
        self.sample_indices_ = np.arange(X.shape[0], dtype=int)
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        for n_iter in range(self.max_iter):
            prev_len = y_.shape[0]
            X_enn, y_enn = self.enn_.fit_resample(X_, y_)

            # Check the stopping criterion
            # 1. If there are no changes in the vector y
            # 2. If the number of samples in any other class becomes inferior
            #    to the number of samples in the original minority class
            # 3. If one of the classes disappears
            # Case 1
            b_conv = prev_len == y_enn.shape[0]

            # Case 2
            stats_enn = Counter(y_enn)
            count_non_min = np.array(
                [
                    val
                    for val, key in zip(stats_enn.values(), stats_enn.keys())
                    if key != class_minority
                ]
            )
            b_min_bec_maj = np.any(
                count_non_min < target_stats[class_minority]
            )

            # Case 3
            b_remove_maj_class = len(stats_enn) < len(target_stats)

            X_, y_ = X_enn, y_enn
            self.sample_indices_ = self.sample_indices_[
                self.enn_.sample_indices_
            ]

            if b_conv or b_min_bec_maj or b_remove_maj_class:
                if b_conv:
                    X_, y_ = X_enn, y_enn
                    self.sample_indices_ = self.sample_indices_[
                        self.enn_.sample_indices_
                    ]
                break

        self.n_iter_ = n_iter + 1
        X_resampled, y_resampled = X_, y_

        return X_resampled, y_resampled

    def _more_tags(self):
        return {"sample_indices": True}


@Substitution(
    sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
)
class AllKNN(BaseCleaningSampler):
    """Undersample based on the AllKNN method.

    This method will apply ENN several times, varying the number of nearest
    neighbours at each pass.

    Read more in the :ref:`User Guide `.

    Parameters
    ----------
    {sampling_strategy}

    n_neighbors : int or object, default=3
        If ``int``, size of the neighbourhood to consider to compute the
        nearest neighbors. If object, an estimator that inherits from
        :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the nearest-neighbors.
kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. - If ``'all'``, all neighbours will have to agree with the samples of interest to not be excluded. - If ``'mode'``, the majority vote of the neighbours will be used in order to exclude a sample. allow_minority : bool, default=False If ``True``, it allows the majority classes to become the minority class without early stopping. .. versionadded:: 0.3 {n_jobs} Attributes ---------- sample_indices_ : ndarray of shape (n_new_samples) Indices of the samples selected. .. versionadded:: 0.4 See Also -------- CondensedNearestNeighbour: Under-sampling by condensing samples. EditedNearestNeighbours: Under-sampling by editing samples. RepeatedEditedNearestNeighbours: Under-sampling by repeating ENN. Notes ----- The method is based on [1]_. Supports multi-class resampling. A one-vs.-rest scheme is used when sampling a class as proposed in [1]_. References ---------- .. [1] I. Tomek, "An Experiment with the Edited Nearest-Neighbor Rule," IEEE Transactions on Systems, Man, and Cybernetics, vol. 6(6), pp. 448-452, June 1976. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ AllKNN # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> allknn = AllKNN() >>> X_res, y_res = allknn.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 887, 0: 100}}) """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", n_neighbors=3, kind_sel="all", allow_minority=False, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.allow_minority = allow_minority self.n_jobs = n_jobs def _validate_estimator(self): """Create objects required by AllKNN""" if self.kind_sel not in SEL_KIND: raise NotImplementedError self.nn_ = check_neighbors_object( "n_neighbors", self.n_neighbors, additional_neighbor=1 ) self.enn_ = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, n_neighbors=self.nn_, kind_sel=self.kind_sel, n_jobs=self.n_jobs, ) def _fit_resample(self, X, y): self._validate_estimator() X_, y_ = X, y target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) self.sample_indices_ = np.arange(X.shape[0], dtype=int) for curr_size_ngh in range(1, self.nn_.n_neighbors): self.enn_.n_neighbors = curr_size_ngh X_enn, y_enn = self.enn_.fit_resample(X_, y_) # Check the stopping criterion # 1. If the number of samples in the other class become inferior to # the number of samples in the majority class # 2. 
If one of the classes is disappearing
                # Case 1
                stats_enn = Counter(y_enn)
                count_non_min = np.array(
                    [
                        val
                        for val, key in zip(
                            stats_enn.values(), stats_enn.keys()
                        )
                        if key != class_minority
                    ]
                )
                b_min_bec_maj = np.any(
                    count_non_min < target_stats[class_minority]
                )
                if self.allow_minority:
                    # overwrite b_min_bec_maj
                    b_min_bec_maj = False

                # Case 2
                b_remove_maj_class = len(stats_enn) < len(target_stats)

                X_, y_ = X_enn, y_enn
                self.sample_indices_ = self.sample_indices_[
                    self.enn_.sample_indices_
                ]

                if b_min_bec_maj or b_remove_maj_class:
                    break

        X_resampled, y_resampled = X_, y_

        return X_resampled, y_resampled

    def _more_tags(self):
        return {"sample_indices": True}

imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py
"""Class to perform under-sampling based on the instance hardness
threshold."""

# Authors: Guillaume Lemaitre
#          Dayvid Oliveira
#          Christos Aridas
# License: MIT

from collections import Counter

import numpy as np

from sklearn.base import ClassifierMixin, clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble._base import _set_random_states
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.utils import check_random_state
from sklearn.utils import _safe_indexing

from ..base import BaseUnderSampler
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._docstring import _random_state_docstring
from ...utils._validation import _deprecate_positional_args


@Substitution(
    sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
    random_state=_random_state_docstring,
)
class InstanceHardnessThreshold(BaseUnderSampler):
    """Undersample based on the instance hardness threshold.

    Read more in the :ref:`User Guide `.

    Parameters
    ----------
    estimator : object, default=None
        Classifier to be used to estimate instance hardness of the samples.
        By default a :class:`sklearn.ensemble.RandomForestClassifier` will be
        used. If object, an estimator inherited from
        :class:`sklearn.base.ClassifierMixin` and having an attribute
        :func:`predict_proba` is expected.

    {sampling_strategy}

    {random_state}

    cv : int, default=5
        Number of folds to be used when estimating samples' instance
        hardness.

    {n_jobs}

    Attributes
    ----------
    sample_indices_ : ndarray of shape (n_new_samples)
        Indices of the samples selected.

        .. versionadded:: 0.4

    See Also
    --------
    NearMiss : Undersample based on near-miss search.

    RandomUnderSampler : Random under-sampling.

    Notes
    -----
    The method is based on [1]_.

    Supports multi-class resampling. A one-vs.-rest scheme is used when
    sampling a class as proposed in [1]_.

    References
    ----------
    .. [1] Smith, Michael R., Tony Martinez, and Christophe Giraud-Carrier.
       "An instance level analysis of data complexity." Machine learning 95.2
       (2014): 225-256.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import InstanceHardnessThreshold
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ...
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> iht = InstanceHardnessThreshold(random_state=42) >>> X_res, y_res = iht.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: +ELLIPSIS Resampled dataset shape Counter({{1: 5..., 0: 100}}) """ @_deprecate_positional_args def __init__( self, *, estimator=None, sampling_strategy="auto", random_state=None, cv=5, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.estimator = estimator self.cv = cv self.n_jobs = n_jobs def _validate_estimator(self, random_state): """Private function to create the classifier""" if ( self.estimator is not None and isinstance(self.estimator, ClassifierMixin) and hasattr(self.estimator, "predict_proba") ): self.estimator_ = clone(self.estimator) _set_random_states(self.estimator_, random_state) elif self.estimator is None: self.estimator_ = RandomForestClassifier( n_estimators=100, random_state=self.random_state, n_jobs=self.n_jobs, ) else: raise ValueError( "Invalid parameter `estimator`. Got {}.".format( type(self.estimator) ) ) def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) self._validate_estimator(random_state) target_stats = Counter(y) skf = StratifiedKFold( n_splits=self.cv, shuffle=True, random_state=random_state, ) probabilities = cross_val_predict( self.estimator_, X, y, cv=skf, n_jobs=self.n_jobs, method='predict_proba' ) probabilities = probabilities[range(len(y)), y] idx_under = np.empty((0,), dtype=int) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] threshold = np.percentile( probabilities[y == target_class], (1.0 - (n_samples / target_stats[target_class])) * 100.0, ) index_target_class = np.flatnonzero( probabilities[y == target_class] >= threshold ) else: index_target_class = slice(None) idx_under = np.concatenate( ( idx_under, np.flatnonzero(y == target_class)[index_target_class], ), axis=0, ) self.sample_indices_ = idx_under return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) def _more_tags(self): return {"sample_indices": True} imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/_nearmiss.py000066400000000000000000000237261366766276300303370ustar00rootroot00000000000000"""Class to perform under-sampling based on nearmiss methods.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import warnings from collections import Counter import numpy as np from sklearn.utils import _safe_indexing from ..base import BaseUnderSampler from ...utils import check_neighbors_object from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring from ...utils._validation import _deprecate_positional_args @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, ) class NearMiss(BaseUnderSampler): """Class to perform under-sampling based on NearMiss methods. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} version : int, default=1 Version of the NearMiss to use. Possible values are 1, 2 or 3. n_neighbors : int or object, default=3 If ``int``, size of the neighbourhood to consider to compute the average distance to the minority point samples. 
If object, an estimator that inherits from
        :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the k_neighbors.

    n_neighbors_ver3 : int or object, default=3
        If ``int``, the NearMiss-3 algorithm starts with a phase of
        re-sampling. This parameter corresponds to the number of neighbours
        selected to create the subset in which the selection will be
        performed. If object, an estimator that inherits from
        :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the k_neighbors.

    {n_jobs}

    Attributes
    ----------
    sample_indices_ : ndarray of shape (n_new_samples)
        Indices of the samples selected.

        .. versionadded:: 0.4

    See Also
    --------
    RandomUnderSampler : Randomly undersample the dataset.

    InstanceHardnessThreshold : Use a classifier to undersample a dataset.

    Notes
    -----
    The methods are based on [1]_.

    Supports multi-class resampling.

    References
    ----------
    .. [1] I. Mani, I. Zhang. "kNN approach to unbalanced data distributions:
       a case study involving information extraction," In Proceedings of
       workshop on learning from imbalanced datasets, 2003.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
NearMiss # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> nm = NearMiss()
    >>> X_res, y_res = nm.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 100, 1: 100}})
    """

    @_deprecate_positional_args
    def __init__(
        self,
        *,
        sampling_strategy="auto",
        version=1,
        n_neighbors=3,
        n_neighbors_ver3=3,
        n_jobs=None,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.version = version
        self.n_neighbors = n_neighbors
        self.n_neighbors_ver3 = n_neighbors_ver3
        self.n_jobs = n_jobs

    def _selection_dist_based(
        self, X, y, dist_vec, num_samples, key, sel_strategy="nearest"
    ):
        """Select the appropriate samples depending on the strategy selected.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Original samples.

        y : array-like, shape (n_samples,)
            Associated label to X.

        dist_vec : ndarray, shape (n_samples,)
            The distance matrix to the nearest neighbour.

        num_samples : int
            The desired number of samples to select.

        key : str or int
            The target class.

        sel_strategy : str, optional (default='nearest')
            Strategy to select the samples. Either 'nearest' or 'farthest'.

        Returns
        -------
        idx_sel : ndarray, shape (num_samples,)
            The list of the indices of the selected samples.
        """

        # Compute the distance considering the farthest neighbour
        dist_avg_vec = np.sum(dist_vec[:, -self.nn_.n_neighbors:], axis=1)

        target_class_indices = np.flatnonzero(y == key)
        if (
            dist_vec.shape[0]
            != _safe_indexing(X, target_class_indices).shape[0]
        ):
            raise RuntimeError(
                "The samples to be selected do not correspond"
                " to the distance matrix given. Ensure that"
                " both `X[y == key]` and `dist_vec` are"
                " related."
) # Sort the list of distance and get the index if sel_strategy == "nearest": sort_way = False elif sel_strategy == "farthest": sort_way = True else: raise NotImplementedError sorted_idx = sorted( range(len(dist_avg_vec)), key=dist_avg_vec.__getitem__, reverse=sort_way, ) # Throw a warning to tell the user that we did not have enough samples # to select and that we just select everything if len(sorted_idx) < num_samples: warnings.warn( "The number of the samples to be selected is larger" " than the number of samples available. The" " balancing ratio cannot be ensure and all samples" " will be returned." ) # Select the desired number of samples return sorted_idx[:num_samples] def _validate_estimator(self): """Private function to create the NN estimator""" self.nn_ = check_neighbors_object("n_neighbors", self.n_neighbors) self.nn_.set_params(**{"n_jobs": self.n_jobs}) if self.version == 3: self.nn_ver3_ = check_neighbors_object( "n_neighbors_ver3", self.n_neighbors_ver3 ) self.nn_ver3_.set_params(**{"n_jobs": self.n_jobs}) if self.version not in (1, 2, 3): raise ValueError( "Parameter `version` must be 1, 2 or 3, got" " {}".format(self.version) ) def _fit_resample(self, X, y): self._validate_estimator() idx_under = np.empty((0,), dtype=int) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) minority_class_indices = np.flatnonzero(y == class_minority) self.nn_.fit(_safe_indexing(X, minority_class_indices)) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] target_class_indices = np.flatnonzero(y == target_class) X_class = _safe_indexing(X, target_class_indices) y_class = _safe_indexing(y, target_class_indices) if self.version == 1: dist_vec, idx_vec = self.nn_.kneighbors( X_class, n_neighbors=self.nn_.n_neighbors ) index_target_class = self._selection_dist_based( X, y, dist_vec, n_samples, target_class, sel_strategy="nearest", ) elif self.version == 2: dist_vec, idx_vec = self.nn_.kneighbors( X_class, n_neighbors=target_stats[class_minority] ) index_target_class = self._selection_dist_based( X, y, dist_vec, n_samples, target_class, sel_strategy="nearest", ) elif self.version == 3: self.nn_ver3_.fit(X_class) dist_vec, idx_vec = self.nn_ver3_.kneighbors( _safe_indexing(X, minority_class_indices) ) idx_vec_farthest = np.unique(idx_vec.reshape(-1)) X_class_selected = _safe_indexing( X_class, idx_vec_farthest) y_class_selected = _safe_indexing( y_class, idx_vec_farthest) dist_vec, idx_vec = self.nn_.kneighbors( X_class_selected, n_neighbors=self.nn_.n_neighbors ) index_target_class = self._selection_dist_based( X_class_selected, y_class_selected, dist_vec, n_samples, target_class, sel_strategy="farthest", ) # idx_tmp is relative to the feature selected in the # previous step and we need to find the indirection index_target_class = idx_vec_farthest[index_target_class] else: index_target_class = slice(None) idx_under = np.concatenate( ( idx_under, np.flatnonzero(y == target_class)[index_target_class], ), axis=0, ) self.sample_indices_ = idx_under return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) def _more_tags(self): return { "sample_indices": True, "_xfail_checks": { "check_samplers_fit_resample": "Fails for NearMiss-3 with less samples than expected" } } imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py000066400000000000000000000151711366766276300342340ustar00rootroot00000000000000"""Class performing under-sampling 
based on the neighbourhood cleaning rule."""

# Authors: Guillaume Lemaitre
#          Christos Aridas
# License: MIT

from collections import Counter

import numpy as np
from scipy.stats import mode

from sklearn.utils import _safe_indexing

from ..base import BaseCleaningSampler
from ._edited_nearest_neighbours import EditedNearestNeighbours
from ...utils import check_neighbors_object
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._validation import _deprecate_positional_args

SEL_KIND = ("all", "mode")


@Substitution(
    sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
)
class NeighbourhoodCleaningRule(BaseCleaningSampler):
    """Undersample based on the neighbourhood cleaning rule.

    This class uses ENN and a k-NN to remove noisy samples from the dataset.

    Read more in the :ref:`User Guide `.

    Parameters
    ----------
    {sampling_strategy}

    n_neighbors : int or object, default=3
        If ``int``, size of the neighbourhood to consider to compute the
        nearest neighbors. If object, an estimator that inherits from
        :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the nearest-neighbors.

    kind_sel : {{"all", "mode"}}, default='all'
        Strategy to use in order to exclude samples in the ENN sampling.

        - If ``'all'``, all neighbours will have to agree with the samples of
          interest to not be excluded.
        - If ``'mode'``, the majority vote of the neighbours will be used in
          order to exclude a sample.

    threshold_cleaning : float, default=0.5
        Threshold used to decide whether a class is considered during the
        cleaning step performed after applying ENN. A class will be cleaned
        when Ci > C x T, where Ci and C are the number of samples in the
        class and in the whole dataset, respectively, and T is the threshold.

    {n_jobs}

    Attributes
    ----------
    sample_indices_ : ndarray of shape (n_new_samples)
        Indices of the samples selected.

        .. versionadded:: 0.4

    See Also
    --------
    EditedNearestNeighbours : Undersample by editing noisy samples.

    Notes
    -----
    See the original paper: [1]_.

    Supports multi-class resampling. A one-vs.-rest scheme is used when
    sampling a class as proposed in [1]_.

    References
    ----------
    .. [1] J. Laurikkala, "Improving identification of difficult small
       classes by balancing class distribution," Springer Berlin Heidelberg,
       2001.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
NeighbourhoodCleaningRule # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ...
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> ncr = NeighbourhoodCleaningRule() >>> X_res, y_res = ncr.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 877, 0: 100}}) """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", n_neighbors=3, kind_sel="all", threshold_cleaning=0.5, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.threshold_cleaning = threshold_cleaning self.n_jobs = n_jobs def _validate_estimator(self): """Create the objects required by NCR.""" self.nn_ = check_neighbors_object( "n_neighbors", self.n_neighbors, additional_neighbor=1 ) self.nn_.set_params(**{"n_jobs": self.n_jobs}) if self.kind_sel not in SEL_KIND: raise NotImplementedError if self.threshold_cleaning > 1 or self.threshold_cleaning < 0: raise ValueError( "'threshold_cleaning' is a value between 0 and 1." " Got {} instead.".format(self.threshold_cleaning) ) def _fit_resample(self, X, y): self._validate_estimator() enn = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, n_neighbors=self.n_neighbors, kind_sel="mode", n_jobs=self.n_jobs, ) enn.fit_resample(X, y) index_not_a1 = enn.sample_indices_ index_a1 = np.ones(y.shape, dtype=bool) index_a1[index_not_a1] = False index_a1 = np.flatnonzero(index_a1) # clean the neighborhood target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) # compute which classes to consider for cleaning for the A2 group classes_under_sample = [ c for c, n_samples in target_stats.items() if ( c in self.sampling_strategy_.keys() and (n_samples > X.shape[0] * self.threshold_cleaning) ) ] self.nn_.fit(X) class_minority_indices = np.flatnonzero(y == class_minority) X_class = _safe_indexing(X, class_minority_indices) y_class = _safe_indexing(y, class_minority_indices) nnhood_idx = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] if self.kind_sel == "mode": nnhood_label_majority, _ = mode(nnhood_label, axis=1) nnhood_bool = np.ravel(nnhood_label_majority) == y_class elif self.kind_sel == "all": nnhood_label_majority = nnhood_label == class_minority nnhood_bool = np.all(nnhood_label, axis=1) else: raise NotImplementedError # compute a2 group index_a2 = np.ravel(nnhood_idx[~nnhood_bool]) index_a2 = np.unique( [index for index in index_a2 if y[index] in classes_under_sample] ) union_a1_a2 = np.union1d(index_a1, index_a2).astype(int) selected_samples = np.ones(y.shape, dtype=bool) selected_samples[union_a1_a2] = False self.sample_indices_ = np.flatnonzero(selected_samples) return ( _safe_indexing(X, self.sample_indices_), _safe_indexing(y, self.sample_indices_), ) def _more_tags(self): return {"sample_indices": True} imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py000066400000000000000000000146741366766276300325160ustar00rootroot00000000000000"""Class to perform under-sampling based on one-sided selection method.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter import numpy as np from sklearn.base import clone from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import check_random_state, _safe_indexing from ..base import BaseCleaningSampler from ._tomek_links import TomekLinks from ...utils import Substitution from 
...utils._docstring import _n_jobs_docstring from ...utils._docstring import _random_state_docstring from ...utils._validation import _deprecate_positional_args @Substitution( sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class OneSidedSelection(BaseCleaningSampler): """Class to perform under-sampling based on one-sided selection method. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} n_neighbors : int or object, default=None If ``int``, size of the neighbourhood to consider to compute the nearest neighbors. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the nearest-neighbors. n_seeds_S : int, default=1 Number of samples to extract in order to build the set S. {n_jobs} Attributes ---------- sample_indices_ : ndarray of shape (n_new_samples) Indices of the samples selected. .. versionadded:: 0.4 See Also -------- EditedNearestNeighbours : Undersample by editing noisy samples. Notes ----- The method is based on [1]_. Supports multi-class resampling. A one-vs.-one scheme is used when sampling a class as proposed in [1]_. For each class to be sampled, all samples of this class and the minority class are used during the sampling procedure. References ---------- .. [1] M. Kubat, S. Matwin, "Addressing the curse of imbalanced training sets: one-sided selection," In ICML, vol. 97, pp. 179-186, 1997. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ OneSidedSelection # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> oss = OneSidedSelection(random_state=42) >>> X_res, y_res = oss.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 496, 0: 100}}) """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", random_state=None, n_neighbors=None, n_seeds_S=1, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.n_neighbors = n_neighbors self.n_seeds_S = n_seeds_S self.n_jobs = n_jobs def _validate_estimator(self): """Private function to create the NN estimator""" if self.n_neighbors is None: self.estimator_ = KNeighborsClassifier( n_neighbors=1, n_jobs=self.n_jobs ) elif isinstance(self.n_neighbors, int): self.estimator_ = KNeighborsClassifier( n_neighbors=self.n_neighbors, n_jobs=self.n_jobs ) elif isinstance(self.n_neighbors, KNeighborsClassifier): self.estimator_ = clone(self.n_neighbors) else: raise ValueError( "`n_neighbors` has to be a int or an object" " inherited from KNeighborsClassifier." 
" Got {} instead.".format(type(self.n_neighbors)) ) def _fit_resample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) idx_under = np.empty((0,), dtype=int) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): # select a sample from the current class idx_maj = np.flatnonzero(y == target_class) sel_idx_maj = random_state.randint( low=0, high=target_stats[target_class], size=self.n_seeds_S ) idx_maj_sample = idx_maj[sel_idx_maj] minority_class_indices = np.flatnonzero(y == class_minority) C_indices = np.append(minority_class_indices, idx_maj_sample) # create the set composed of all minority samples and one # sample from the current class. C_x = _safe_indexing(X, C_indices) C_y = _safe_indexing(y, C_indices) # create the set S with removing the seed from S # since that it will be added anyway idx_maj_extracted = np.delete(idx_maj, sel_idx_maj, axis=0) S_x = _safe_indexing(X, idx_maj_extracted) S_y = _safe_indexing(y, idx_maj_extracted) self.estimator_.fit(C_x, C_y) pred_S_y = self.estimator_.predict(S_x) S_misclassified_indices = np.flatnonzero(pred_S_y != S_y) idx_tmp = idx_maj_extracted[S_misclassified_indices] idx_under = np.concatenate( (idx_under, idx_maj_sample, idx_tmp), axis=0 ) else: idx_under = np.concatenate( (idx_under, np.flatnonzero(y == target_class)), axis=0 ) X_resampled = _safe_indexing(X, idx_under) y_resampled = _safe_indexing(y, idx_under) # apply Tomek cleaning tl = TomekLinks(sampling_strategy=list(self.sampling_strategy_.keys())) X_cleaned, y_cleaned = tl.fit_resample(X_resampled, y_resampled) self.sample_indices_ = _safe_indexing(idx_under, tl.sample_indices_) return X_cleaned, y_cleaned def _more_tags(self): return {"sample_indices": True} imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py000066400000000000000000000075061366766276300327140ustar00rootroot00000000000000"""Class to perform random under-sampling.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np from sklearn.utils import check_random_state from sklearn.utils import _safe_indexing from ..base import BaseUnderSampler from ...utils import check_target_type from ...utils import Substitution from ...utils._docstring import _random_state_docstring from ...utils._validation import _deprecate_positional_args @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, random_state=_random_state_docstring, ) class RandomUnderSampler(BaseUnderSampler): """Class to perform random under-sampling. Under-sample the majority class(es) by randomly picking samples with or without replacement. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} replacement : bool, default=False Whether the sample is with or without replacement. Attributes ---------- sample_indices_ : ndarray of shape (n_new_samples) Indices of the samples selected. .. versionadded:: 0.4 See Also -------- NearMiss : Undersample using near-miss samples. Notes ----- Supports multi-class resampling by sampling each class independently. Supports heterogeneous data as object array containing string and numeric data. 
Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> rus = RandomUnderSampler(random_state=42) >>> X_res, y_res = rus.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 100, 1: 100}}) """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", random_state=None, replacement=False ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.replacement = replacement def _check_X_y(self, X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = self._validate_data( X, y, reset=True, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False, ) return X, y, binarize_y def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) idx_under = np.empty((0,), dtype=int) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] index_target_class = random_state.choice( range(np.count_nonzero(y == target_class)), size=n_samples, replace=self.replacement, ) else: index_target_class = slice(None) idx_under = np.concatenate( ( idx_under, np.flatnonzero(y == target_class)[index_target_class], ), axis=0, ) self.sample_indices_ = idx_under return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) def _more_tags(self): return { "X_types": ["2darray", "string"], "sample_indices": True, "allow_nan": True, } imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/_tomek_links.py000066400000000000000000000106741366766276300310330ustar00rootroot00000000000000"""Class to perform under-sampling by removing Tomek's links.""" # Authors: Guillaume Lemaitre # Fernando Nogueira # Christos Aridas # License: MIT import numpy as np from sklearn.neighbors import NearestNeighbors from sklearn.utils import _safe_indexing from ..base import BaseCleaningSampler from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring from ...utils._validation import _deprecate_positional_args @Substitution( sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, ) class TomekLinks(BaseCleaningSampler): """Under-sampling by removing Tomek's links. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {n_jobs} Attributes ---------- sample_indices_ : ndarray of shape (n_new_samples) Indices of the samples selected. .. versionadded:: 0.4 See Also -------- EditedNearestNeighbours : Undersample by samples edition. CondensedNearestNeighbour : Undersample by samples condensation. RandomUnderSampling : Randomly under-sample the dataset. Notes ----- This method is based on [1]_. Supports multi-class resampling. A one-vs.-rest scheme is used as originally proposed in [1]_. References ---------- .. [1] I. Tomek, "Two modifications of CNN," In Systems, Man, and Cybernetics, IEEE Transactions on, vol. 6, pp 769-772, 1976. 
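# ------------------------------------------------------------------
# Illustrative sketch (not part of the library): two samples form a Tomek
# link when they are mutual nearest neighbours carrying different labels.
# A hand-checkable example on four 1-D points (the toy arrays are
# assumptions made for this example only):
import numpy as np
from sklearn.neighbors import NearestNeighbors

X_toy = np.array([[0.0], [0.1], [2.0], [2.1]])
y_toy = np.array([0, 1, 1, 1])
nn = NearestNeighbors(n_neighbors=2).fit(X_toy)
nearest = nn.kneighbors(X_toy, return_distance=False)[:, 1]
is_link = [
    bool(nearest[nearest[i]] == i and y_toy[nearest[i]] != y_toy[i])
    for i in range(len(y_toy))
]
print(is_link)  # [True, True, False, False]: only the 0/1 pair is a link
# ------------------------------------------------------------------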
    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
TomekLinks # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> tl = TomekLinks()
    >>> X_res, y_res = tl.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{1: 897, 0: 100}})
    """

    @_deprecate_positional_args
    def __init__(self, *, sampling_strategy="auto", n_jobs=None):
        super().__init__(sampling_strategy=sampling_strategy)
        self.n_jobs = n_jobs

    @staticmethod
    def is_tomek(y, nn_index, class_type):
        """Detect if samples are Tomek's links.

        More precisely, it uses the target vector and the first neighbour of
        every sample point and looks for Tomek pairs. It returns a boolean
        vector with True for majority Tomek links.

        Parameters
        ----------
        y : ndarray of shape (n_samples,)
            Target vector of the data set, necessary to keep track of whether
            a sample belongs to the minority class or not.

        nn_index : ndarray of shape (len(y),)
            The index of the closest nearest neighbour to a sample point.

        class_type : int or str
            The label of the minority class.

        Returns
        -------
        is_tomek : ndarray of shape (len(y),)
            Boolean vector with True for majority samples that are Tomek
            links.
        """
        links = np.zeros(len(y), dtype=bool)

        # find which class to not consider
        class_excluded = [c for c in np.unique(y) if c not in class_type]

        # there is a Tomek link between two samples if they are both nearest
        # neighbours of each other
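        # The loop below checks, for every candidate sample, (i) that its
        # class is one being cleaned, (ii) that its nearest neighbour carries
        # a different label, and (iii) that the neighbour's own nearest
        # neighbour points back at the sample -- the mutual-nearest-neighbour
        # condition described above.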
for index_sample, target_sample in enumerate(y): if target_sample in class_excluded: continue if y[nn_index[index_sample]] != target_sample: if nn_index[nn_index[index_sample]] == index_sample: links[index_sample] = True return links def _fit_resample(self, X, y): # Find the nearest neighbour of every point nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs) nn.fit(X) nns = nn.kneighbors(X, return_distance=False)[:, 1] links = self.is_tomek(y, nns, self.sampling_strategy_) self.sample_indices_ = np.flatnonzero(np.logical_not(links)) return ( _safe_indexing(X, self.sample_indices_), _safe_indexing(y, self.sample_indices_), ) def _more_tags(self): return {"sample_indices": True} imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests/000077500000000000000000000000001366766276300271355ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests/__init__.py000066400000000000000000000000001366766276300312340ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py000066400000000000000000000211061366766276300320250ustar00rootroot00000000000000"""Test the module repeated edited nearest neighbour.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import pytest import numpy as np from sklearn.utils._testing import assert_allclose, assert_array_equal from sklearn.neighbors import NearestNeighbors from sklearn.datasets import make_classification from imblearn.under_sampling import AllKNN X = np.array( [ [-0.12840393, 0.66446571], [1.32319756, -0.13181616], [0.04296502, -0.37981873], [0.83631853, 0.18569783], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.53171468, -0.53735182], [1.3381556, 0.35956356], [-0.35946678, 0.72510189], [1.32326943, 0.28393874], [2.94290565, -0.13986434], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [-0.88864036, -0.33782387], [-1.10146139, 0.91782682], [-0.7969716, -0.50493969], [0.73489726, 0.43915195], [0.2096964, -0.61814058], [-0.28479268, 0.70459548], [1.84864913, 0.14729596], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.57356906, 0.30390519], [1.0304995, -0.16955962], [1.67314371, 0.19231498], [0.98382284, 0.37184502], [0.48921682, -1.38504507], [-0.46226554, -0.50481004], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172], [0.69804044, 0.44810796], [-0.5506368, -0.42072426], [-0.34474418, 0.21969797], ] ) Y = np.array( [ 1, 2, 2, 2, 1, 1, 0, 2, 1, 1, 1, 2, 2, 0, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 0, 2, 2, 2, 2, 1, 2, 0, ] ) R_TOL = 1e-4 def test_allknn_fit_resample(): allknn = AllKNN() X_resampled, y_resampled = allknn.fit_resample(X, Y) X_gt = np.array( [ [-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [0.69804044, 0.44810796], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [0.48921682, -1.38504507], [-0.03918551, -0.68540745], [0.24991051, 
-1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172], ] ) y_gt = np.array( [ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_allclose(y_resampled, y_gt, rtol=R_TOL) def test_all_knn_allow_minority(): X, y = make_classification( n_samples=10000, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=3, n_clusters_per_class=1, weights=[0.2, 0.3, 0.5], class_sep=0.4, random_state=0, ) allknn = AllKNN(allow_minority=True) X_res_1, y_res_1 = allknn.fit_resample(X, y) allknn = AllKNN() X_res_2, y_res_2 = allknn.fit_resample(X, y) assert len(y_res_1) < len(y_res_2) def test_allknn_fit_resample_mode(): allknn = AllKNN(kind_sel="mode") X_resampled, y_resampled = allknn.fit_resample(X, Y) X_gt = np.array( [ [-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [0.48921682, -1.38504507], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172], ] ) y_gt = np.array( [ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ] ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_allknn_fit_resample_with_nn_object(): nn = NearestNeighbors(n_neighbors=4) allknn = AllKNN(n_neighbors=nn, kind_sel="mode") X_resampled, y_resampled = allknn.fit_resample(X, Y) X_gt = np.array( [ [-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [0.48921682, -1.38504507], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172], ] ) y_gt = np.array( [ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ] ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_alknn_not_good_object(): nn = "rnd" allknn = AllKNN(n_neighbors=nn, kind_sel="mode") with pytest.raises(ValueError): allknn.fit_resample(X, Y) test_condensed_nearest_neighbour.py000066400000000000000000000062641366766276300362240ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests"""Test the module condensed nearest neighbour.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import pytest import 
numpy as np
from sklearn.utils._testing import assert_array_equal
from sklearn.neighbors import KNeighborsClassifier

from imblearn.under_sampling import CondensedNearestNeighbour

RND_SEED = 0
X = np.array(
    [
        [2.59928271, 0.93323465], [0.25738379, 0.95564169],
        [1.42772181, 0.526027], [1.92365863, 0.82718767],
        [-0.10903849, -0.12085181], [-0.284881, -0.62730973],
        [0.57062627, 1.19528323], [0.03394306, 0.03986753],
        [0.78318102, 2.59153329], [0.35831463, 1.33483198],
        [-0.14313184, -1.0412815], [0.01936241, 0.17799828],
        [-1.25020462, -0.40402054], [-0.09816301, -0.74662486],
        [-0.01252787, 0.34102657], [0.52726792, -0.38735648],
        [0.2821046, -0.07862747], [0.05230552, 0.09043907],
        [0.15198585, 0.12512646], [0.70524765, 0.39816382],
    ]
)
Y = np.array([1, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 1])


def test_cnn_init():
    cnn = CondensedNearestNeighbour(random_state=RND_SEED)
    assert cnn.n_seeds_S == 1
    assert cnn.n_jobs is None


def test_cnn_fit_resample():
    cnn = CondensedNearestNeighbour(random_state=RND_SEED)
    X_resampled, y_resampled = cnn.fit_resample(X, Y)
    X_gt = np.array(
        [
            [-0.10903849, -0.12085181], [0.01936241, 0.17799828],
            [0.05230552, 0.09043907], [-1.25020462, -0.40402054],
            [0.70524765, 0.39816382], [0.35831463, 1.33483198],
            [-0.284881, -0.62730973], [0.03394306, 0.03986753],
            [-0.01252787, 0.34102657], [0.15198585, 0.12512646],
        ]
    )
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)


def test_cnn_fit_resample_with_object():
    knn = KNeighborsClassifier(n_neighbors=1)
    cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn)
    X_resampled, y_resampled = cnn.fit_resample(X, Y)
    X_gt = np.array(
        [
            [-0.10903849, -0.12085181], [0.01936241, 0.17799828],
            [0.05230552, 0.09043907], [-1.25020462, -0.40402054],
            [0.70524765, 0.39816382], [0.35831463, 1.33483198],
            [-0.284881, -0.62730973], [0.03394306, 0.03986753],
            [-0.01252787, 0.34102657], [0.15198585, 0.12512646],
        ]
    )
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=1)
    X_resampled, y_resampled = cnn.fit_resample(X, Y)
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)


def test_cnn_fit_resample_with_wrong_object():
    knn = "rnd"
    cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn)
    with pytest.raises(ValueError, match="has to be a int or an "):
        cnn.fit_resample(X, Y)
test_edited_nearest_neighbours.py000066400000000000000000000074441366766276300357040ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests
"""Test the module edited nearest neighbour."""
# Authors: Guillaume Lemaitre
#          Christos Aridas
# License: MIT

import pytest
import numpy as np

from sklearn.utils._testing import assert_array_equal
from sklearn.neighbors import NearestNeighbors

from imblearn.under_sampling import EditedNearestNeighbours

X = np.array(
    [
        [2.59928271, 0.93323465], [0.25738379, 0.95564169],
        [1.42772181, 0.526027], [1.92365863, 0.82718767],
        [-0.10903849, -0.12085181], [-0.284881, -0.62730973],
        [0.57062627, 1.19528323], [0.03394306, 0.03986753],
        [0.78318102, 2.59153329], [0.35831463, 1.33483198],
        [-0.14313184, -1.0412815], [0.01936241, 0.17799828],
        [-1.25020462, -0.40402054], [-0.09816301, -0.74662486],
        [-0.01252787, 0.34102657], [0.52726792, -0.38735648],
        [0.2821046, -0.07862747], [0.05230552, 0.09043907],
        [0.15198585, 0.12512646], [0.70524765, 0.39816382],
    ]
)
Y = np.array([1, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 1])


def test_enn_init():
    enn = EditedNearestNeighbours()
    assert enn.n_neighbors == 3
    assert enn.kind_sel == "all"
    assert enn.n_jobs is None


def test_enn_fit_resample():
    enn = EditedNearestNeighbours()
    X_resampled, y_resampled = enn.fit_resample(X, Y)
    X_gt = np.array(
        [
            [-0.10903849, -0.12085181], [0.01936241, 0.17799828],
            [2.59928271, 0.93323465], [1.92365863, 0.82718767],
            [0.25738379, 0.95564169], [0.78318102, 2.59153329],
            [0.52726792, -0.38735648],
        ]
    )
    y_gt = np.array([0, 0, 1, 1, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)


def test_enn_fit_resample_mode():
    enn = EditedNearestNeighbours(kind_sel="mode")
    X_resampled, y_resampled = enn.fit_resample(X, Y)
    X_gt = np.array(
        [
            [-0.10903849, -0.12085181], [0.01936241, 0.17799828],
            [2.59928271, 0.93323465], [1.42772181, 0.526027],
            [1.92365863, 0.82718767], [0.25738379, 0.95564169],
            [-0.284881, -0.62730973], [0.57062627, 1.19528323],
            [0.78318102, 2.59153329], [0.35831463, 1.33483198],
            [-0.14313184, -1.0412815], [-0.09816301, -0.74662486],
            [0.52726792, -0.38735648], [0.2821046, -0.07862747],
        ]
    )
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)


def test_enn_fit_resample_with_nn_object():
    nn = NearestNeighbors(n_neighbors=4)
    enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel="mode")
    X_resampled, y_resampled = enn.fit_resample(X, Y)
    X_gt = np.array(
        [
            [-0.10903849, -0.12085181], [0.01936241, 0.17799828],
            [2.59928271, 0.93323465], [1.42772181, 0.526027],
            [1.92365863, 0.82718767], [0.25738379, 0.95564169],
            [-0.284881, -0.62730973], [0.57062627, 1.19528323],
            [0.78318102, 2.59153329], [0.35831463, 1.33483198],
            [-0.14313184, -1.0412815], [-0.09816301, -0.74662486],
            [0.52726792, -0.38735648], [0.2821046, -0.07862747],
        ]
    )
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)


def test_enn_not_good_object():
    nn = "rnd"
    enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel="mode")
    with pytest.raises(ValueError, match="has to be one of"):
        enn.fit_resample(X, Y)
test_instance_hardness_threshold.py000066400000000000000000000060661366766276300362460ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests
"""Test the module instance hardness threshold."""
# Authors: Guillaume Lemaitre
#          Christos Aridas
# License: MIT

import pytest
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.utils._testing import assert_array_equal

from imblearn.under_sampling import InstanceHardnessThreshold

RND_SEED = 0
X = np.array(
    [
        [-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
        [-0.03852113, 0.40910479], [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738], [-0.18430329, 0.52328473],
        [-0.30126957, -0.66268378], [-0.65571327, 0.42412021],
        [-0.28305528, 0.30284991], [0.20246714, -0.34727125],
        [1.06446472, -1.09279772], [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
    ]
)
Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0])
ESTIMATOR = GradientBoostingClassifier(random_state=RND_SEED)


def test_iht_init():
    sampling_strategy = "auto"
    iht = InstanceHardnessThreshold(
        estimator=ESTIMATOR,
        sampling_strategy=sampling_strategy,
        random_state=RND_SEED,
    )
    assert iht.sampling_strategy == sampling_strategy
    assert iht.random_state == RND_SEED


def test_iht_fit_resample():
    iht = InstanceHardnessThreshold(estimator=ESTIMATOR, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    assert X_resampled.shape == (12, 2)
    assert y_resampled.shape == (12,)


def test_iht_fit_resample_half():
    sampling_strategy = {0: 3, 1: 3}
    iht = InstanceHardnessThreshold(
        estimator=NB(), sampling_strategy=sampling_strategy, random_state=RND_SEED
    )
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    assert X_resampled.shape == (6, 2)
    assert y_resampled.shape == (6,)


def test_iht_fit_resample_class_obj():
    est = GradientBoostingClassifier(random_state=RND_SEED)
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    assert X_resampled.shape == (12, 2)
    assert y_resampled.shape == (12,)


def test_iht_fit_resample_wrong_class_obj():
    from sklearn.cluster import KMeans

    est = KMeans()
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    with pytest.raises(ValueError, match="Invalid parameter `estimator`"):
        iht.fit_resample(X, Y)


def test_iht_reproducibility():
    from sklearn.datasets import load_digits

    X_digits, y_digits = load_digits(return_X_y=True)
    idx_sampled = []
    for seed in range(5):
        est = RandomForestClassifier(n_estimators=10, random_state=seed)
        iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
        iht.fit_resample(X_digits, y_digits)
        idx_sampled.append(iht.sample_indices_.copy())
    for idx_1, idx_2 in zip(idx_sampled, idx_sampled[1:]):
        assert_array_equal(idx_1, idx_2)
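
# Hedged usage sketch (added for illustration, not part of the original test
# suite): running this module directly shows how InstanceHardnessThreshold
# keeps only the samples that ``ESTIMATOR`` classifies confidently; the exact
# resampled counts depend on both the data and the estimator.
if __name__ == "__main__":  # pragma: no cover
    iht = InstanceHardnessThreshold(estimator=ESTIMATOR, random_state=RND_SEED)
    X_res, y_res = iht.fit_resample(X, Y)
    print("class counts after resampling:", np.bincount(y_res))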
imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py000066400000000000000000000166411366766276300323770ustar00rootroot00000000000000
"""Test the module nearmiss."""
# Authors: Guillaume Lemaitre
#          Christos Aridas
# License: MIT

import pytest
import numpy as np

from sklearn.utils._testing import assert_array_equal
from sklearn.neighbors import NearestNeighbors

from imblearn.under_sampling import NearMiss

X = np.array(
    [
        [1.17737838, -0.2002118], [0.4960075, 0.86130762],
        [-0.05903827, 0.10947647], [0.91464286, 1.61369212],
        [-0.54619583, 1.73009918], [-0.60413357, 0.24628718],
        [0.45713638, 1.31069295], [-0.04032409, 3.01186964],
        [0.03142011, 0.12323596], [0.50701028, -0.17636928],
        [-0.80809175, -1.09917302], [-0.20497017, -0.26630228],
        [0.99272351, -0.11631728], [-1.95581933, 0.69609604],
        [1.15157493, -1.2981518],
    ]
)
Y = np.array([1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 0, 0, 2, 1, 2])

VERSION_NEARMISS = (1, 2, 3)


@pytest.mark.parametrize(
    "nearmiss_params, err_msg",
    [
        ({"version": 1000}, "must be 1, 2 or 3"),
        ({"version": 1, "n_neighbors": "rnd"}, "has to be one of"),
        (
            {
                "version": 3,
                "n_neighbors": NearestNeighbors(n_neighbors=3),
                "n_neighbors_ver3": "rnd",
            },
            "has to be one of",
        ),
    ],
)
def test_nearmiss_error(nearmiss_params, err_msg):
    nm = NearMiss(**nearmiss_params)
    with pytest.raises(ValueError, match=err_msg):
        nm.fit_resample(X, Y)


def test_nm_fit_resample_auto():
    sampling_strategy = "auto"
    X_gt = [
        np.array(
            [
                [0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                [-0.20497017, -0.26630228], [-0.05903827, 0.10947647],
                [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
                [0.50701028, -0.17636928], [0.4960075, 0.86130762],
                [0.45713638, 1.31069295],
            ]
        ),
        np.array(
            [
                [0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                [-0.20497017, -0.26630228], [-0.05903827, 0.10947647],
                [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
                [0.50701028, -0.17636928], [0.4960075, 0.86130762],
                [0.45713638, 1.31069295],
            ]
        ),
        np.array(
            [
                [0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                [-0.20497017, -0.26630228], [1.17737838, -0.2002118],
                [-0.60413357, 0.24628718], [0.03142011, 0.12323596],
                [1.15157493, -1.2981518], [-0.54619583, 1.73009918],
                [0.99272351, -0.11631728],
            ]
        ),
    ]
    y_gt = [
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
    ]
    for version_idx, version in enumerate(VERSION_NEARMISS):
        nm = NearMiss(sampling_strategy=sampling_strategy, version=version)
        X_resampled, y_resampled = nm.fit_resample(X, Y)
        assert_array_equal(X_resampled, X_gt[version_idx])
        assert_array_equal(y_resampled, y_gt[version_idx])


def test_nm_fit_resample_float_sampling_strategy():
    sampling_strategy = {0: 3, 1: 4, 2: 4}
    X_gt = [
        np.array(
            [
                [-0.20497017, -0.26630228], [-0.80809175, -1.09917302],
                [0.91464286, 1.61369212], [-0.05903827, 0.10947647],
                [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
                [1.17737838, -0.2002118], [0.50701028, -0.17636928],
                [0.4960075, 0.86130762], [0.45713638, 1.31069295],
                [0.99272351, -0.11631728],
            ]
        ),
        np.array(
            [
                [-0.20497017, -0.26630228], [-0.80809175, -1.09917302],
                [0.91464286, 1.61369212], [-0.05903827, 0.10947647],
                [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
                [1.17737838, -0.2002118], [0.50701028, -0.17636928],
                [0.4960075, 0.86130762], [0.45713638, 1.31069295],
                [0.99272351, -0.11631728],
            ]
        ),
        np.array(
            [
                [0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                [-0.20497017, -0.26630228], [1.17737838, -0.2002118],
                [-0.60413357, 0.24628718], [0.03142011, 0.12323596],
                [-0.05903827, 0.10947647], [1.15157493, -1.2981518],
                [-0.54619583, 1.73009918], [0.99272351, -0.11631728],
                [0.45713638, 1.31069295],
            ]
        ),
    ]
    y_gt = [
        np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]),
    ]
    for version_idx, version in enumerate(VERSION_NEARMISS):
        nm = NearMiss(sampling_strategy=sampling_strategy, version=version)
        X_resampled, y_resampled = nm.fit_resample(X, Y)
        assert_array_equal(X_resampled, X_gt[version_idx])
        assert_array_equal(y_resampled, y_gt[version_idx])


def test_nm_fit_resample_nn_obj():
    sampling_strategy = "auto"
    nn = NearestNeighbors(n_neighbors=3)
    X_gt = [
        np.array(
            [
                [0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                [-0.20497017, -0.26630228], [-0.05903827, 0.10947647],
                [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
                [0.50701028, -0.17636928], [0.4960075, 0.86130762],
                [0.45713638, 1.31069295],
            ]
        ),
        np.array(
            [
                [0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                [-0.20497017, -0.26630228], [-0.05903827, 0.10947647],
                [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
                [0.50701028, -0.17636928], [0.4960075, 0.86130762],
                [0.45713638, 1.31069295],
            ]
        ),
        np.array(
            [
                [0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                [-0.20497017, -0.26630228], [1.17737838, -0.2002118],
                [-0.60413357, 0.24628718], [0.03142011, 0.12323596],
                [1.15157493, -1.2981518], [-0.54619583, 1.73009918],
                [0.99272351, -0.11631728],
            ]
        ),
    ]
    y_gt = [
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
    ]
    for version_idx, version in enumerate(VERSION_NEARMISS):
        nm = NearMiss(
            sampling_strategy=sampling_strategy,
            version=version,
            n_neighbors=nn,
        )
        X_resampled, y_resampled = nm.fit_resample(X, Y)
        assert_array_equal(X_resampled, X_gt[version_idx])
        assert_array_equal(y_resampled, y_gt[version_idx])
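
# Hedged usage sketch (added for illustration, not an original test): all
# three NearMiss versions equalise the class counts on this module's toy
# data, as the ground-truth arrays above assert; only *which* samples are
# kept differs between versions.
if __name__ == "__main__":  # pragma: no cover
    for version in VERSION_NEARMISS:
        X_res, y_res = NearMiss(version=version).fit_resample(X, Y)
        print("NearMiss-%d counts:" % version, np.bincount(y_res))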
test_neighbourhood_cleaning_rule.py000066400000000000000000000052671366766276300362240ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests
"""Test the module neighbourhood cleaning rule."""
# Authors: Guillaume Lemaitre
#          Christos Aridas
# License: MIT

import pytest
import numpy as np

from sklearn.utils._testing import assert_array_equal

from imblearn.under_sampling import NeighbourhoodCleaningRule

X = np.array(
    [
        [1.57737838, 0.1997882], [0.8960075, 0.46130762],
        [0.34096173, 0.50947647], [-0.91735824, 0.93110278],
        [-0.14619583, 1.33009918], [-0.20413357, 0.64628718],
        [0.85713638, 0.91069295], [0.35967591, 2.61186964],
        [0.43142011, 0.52323596], [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728], [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ]
)
Y = np.array([1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 0, 0, 2, 1, 2])


@pytest.mark.parametrize(
    "ncr_params, err_msg",
    [
        ({"threshold_cleaning": -10}, "value between 0 and 1"),
        ({"threshold_cleaning": 10}, "value between 0 and 1"),
        ({"n_neighbors": "rnd"}, "has to be one of"),
    ],
)
def test_ncr_error(ncr_params, err_msg):
    ncr = NeighbourhoodCleaningRule(**ncr_params)
    with pytest.raises(ValueError, match=err_msg):
        ncr.fit_resample(X, Y)


def test_ncr_fit_resample():
    ncr = NeighbourhoodCleaningRule()
    X_resampled, y_resampled = ncr.fit_resample(X, Y)
    X_gt = np.array(
        [
            [0.34096173, 0.50947647], [-0.91735824, 0.93110278],
            [-0.20413357, 0.64628718], [0.35967591, 2.61186964],
            [0.90701028, -0.57636928], [-1.20809175, -1.49917302],
            [-0.60497017, -0.66630228], [1.39272351, -0.51631728],
            [-1.55581933, 1.09609604], [1.55157493, -1.6981518],
        ]
    )
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)


def test_ncr_fit_resample_mode():
    ncr = NeighbourhoodCleaningRule(kind_sel="mode")
    X_resampled, y_resampled = ncr.fit_resample(X, Y)
    X_gt = np.array(
        [
            [0.34096173, 0.50947647], [-0.91735824, 0.93110278],
            [-0.20413357, 0.64628718], [0.35967591, 2.61186964],
            [0.90701028, -0.57636928], [-1.20809175, -1.49917302],
            [-0.60497017, -0.66630228], [1.39272351, -0.51631728],
            [-1.55581933, 1.09609604], [1.55157493, -1.6981518],
        ]
    )
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
test_one_sided_selection.py000066400000000000000000000062161366766276300344720ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests
"""Test the module one-sided selection."""
# Authors: Guillaume Lemaitre
#          Christos Aridas
# License: MIT

import pytest
import numpy as np

from sklearn.utils._testing import assert_array_equal
from sklearn.neighbors import KNeighborsClassifier

from imblearn.under_sampling import OneSidedSelection

RND_SEED = 0
X = np.array(
    [
        [-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
        [-0.03852113, 0.40910479], [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738], [-0.18430329, 0.52328473],
        [-0.30126957, -0.66268378], [-0.65571327, 0.42412021],
        [-0.28305528, 0.30284991], [0.20246714, -0.34727125],
        [1.06446472, -1.09279772], [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
    ]
)
Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0])


def test_oss_init():
    oss = OneSidedSelection(random_state=RND_SEED)
    assert oss.n_seeds_S == 1
    assert oss.n_jobs is None
    assert oss.random_state == RND_SEED


def test_oss_fit_resample():
    oss = OneSidedSelection(random_state=RND_SEED)
    X_resampled, y_resampled = oss.fit_resample(X, Y)
    X_gt = np.array(
        [
            [-0.3879569, 0.6894251], [0.91542919, -0.65453327],
            [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
            [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
            [-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
            [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
            [-0.30126957, -0.66268378], [0.20246714, -0.34727125],
        ]
    )
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)


def test_oss_with_object():
    knn = KNeighborsClassifier(n_neighbors=1)
    oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn)
    X_resampled, y_resampled = oss.fit_resample(X, Y)
    X_gt = np.array(
        [
            [-0.3879569, 0.6894251], [0.91542919, -0.65453327],
            [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
            [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
            [-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
            [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
            [-0.30126957, -0.66268378], [0.20246714, -0.34727125],
        ]
    )
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    knn = 1
    oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn)
    X_resampled, y_resampled = oss.fit_resample(X, Y)
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)


def test_oss_with_wrong_object():
    knn = "rnd"
    oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn)
    with pytest.raises(ValueError, match="has to be a int"):
        oss.fit_resample(X, Y)
test_random_under_sampler.py000066400000000000000000000073171366766276300346760ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests
"""Test the module random under sampler."""
# Authors: Guillaume Lemaitre
#          Christos Aridas
# License: MIT

from collections import Counter

import numpy as np
import pytest

from sklearn.utils._testing import assert_array_equal

from imblearn.under_sampling import RandomUnderSampler

RND_SEED = 0
X = np.array(
    [
        [0.04352327, -0.20515826], [0.92923648, 0.76103773],
        [0.20792588, 1.49407907], [0.47104475, 0.44386323],
        [0.22950086, 0.33367433], [0.15490546, 0.3130677],
        [0.09125309, -0.85409574], [0.12372842, 0.6536186],
        [0.13347175, 0.12167502], [0.094035, -2.55298982],
    ]
)
Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1])


@pytest.mark.parametrize("as_frame", [True, False], ids=["dataframe", "array"])
def test_rus_fit_resample(as_frame):
    if as_frame:
        pd = pytest.importorskip("pandas")
        X_ = pd.DataFrame(X)
    else:
        X_ = X
    rus = RandomUnderSampler(random_state=RND_SEED, replacement=True)
    X_resampled, y_resampled = rus.fit_resample(X_, Y)
    X_gt = np.array(
        [
            [0.92923648, 0.76103773], [0.47104475, 0.44386323],
            [0.13347175, 0.12167502], [0.09125309, -0.85409574],
            [0.12372842, 0.6536186], [0.04352327, -0.20515826],
        ]
    )
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    if as_frame:
        assert hasattr(X_resampled, "loc")
        X_resampled = X_resampled.to_numpy()
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)


def test_rus_fit_resample_half():
    sampling_strategy = {0: 3, 1: 6}
    rus = RandomUnderSampler(
        sampling_strategy=sampling_strategy,
        random_state=RND_SEED,
        replacement=True,
    )
    X_resampled, y_resampled = rus.fit_resample(X, Y)
    X_gt = np.array(
        [
            [0.92923648, 0.76103773], [0.47104475, 0.44386323],
            [0.92923648, 0.76103773], [0.15490546, 0.3130677],
            [0.15490546, 0.3130677], [0.15490546, 0.3130677],
            [0.20792588, 1.49407907], [0.15490546, 0.3130677],
            [0.12372842, 0.6536186],
        ]
    )
    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)


def test_multiclass_fit_resample():
    y = Y.copy()
    y[5] = 2
    y[6] = 2
    rus = RandomUnderSampler(random_state=RND_SEED)
    X_resampled, y_resampled = rus.fit_resample(X, y)
    count_y_res = Counter(y_resampled)
    assert count_y_res[0] == 2
    assert count_y_res[1] == 2
    assert count_y_res[2] == 2


def test_random_under_sampling_heterogeneous_data():
    X_hetero = np.array(
        [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=np.object
    )
    y = np.array([0, 0, 1])
    rus = RandomUnderSampler(random_state=RND_SEED)
    X_res, y_res = rus.fit_resample(X_hetero, y)
    assert X_res.shape[0] == 2
    assert y_res.shape[0] == 2
    assert X_res.dtype == object


def test_random_under_sampling_nan_inf():
    # check that we can undersample even with missing or infinite data
    # regression tests for #605
    rng = np.random.RandomState(42)
    n_not_finite = X.shape[0] // 3
    row_indices = rng.choice(np.arange(X.shape[0]), size=n_not_finite)
    col_indices = rng.randint(0, X.shape[1], size=n_not_finite)
    not_finite_values = rng.choice([np.nan, np.inf], size=n_not_finite)
    X_ = X.copy()
    X_[row_indices, col_indices] = not_finite_values
    rus = RandomUnderSampler(random_state=0)
    X_res, y_res = rus.fit_resample(X_, Y)
    assert y_res.shape == (6,)
    assert X_res.shape == (6, 2)
    assert np.any(~np.isfinite(X_res))
test_repeated_edited_nearest_neighbours.py000066400000000000000000000220641366766276300375500ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests
"""Test the module repeated edited nearest neighbour."""
# Authors: Guillaume Lemaitre
#          Christos Aridas
# License: MIT

import pytest
import numpy as np

from sklearn.utils._testing import assert_array_equal
from sklearn.neighbors import NearestNeighbors

from imblearn.under_sampling import RepeatedEditedNearestNeighbours

X = np.array(
    [
        [-0.12840393, 0.66446571], [1.32319756, -0.13181616],
        [0.04296502, -0.37981873], [0.83631853, 0.18569783],
        [1.02956816, 0.36061601], [1.12202806, 0.33811558],
        [-0.53171468, -0.53735182], [1.3381556, 0.35956356],
        [-0.35946678, 0.72510189], [1.32326943, 0.28393874],
        [2.94290565, -0.13986434], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [-0.88864036, -0.33782387],
        [-1.10146139, 0.91782682], [-0.7969716, -0.50493969],
        [0.73489726, 0.43915195], [0.2096964, -0.61814058],
        [-0.28479268, 0.70459548], [1.84864913, 0.14729596],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.50307437, 0.498805], [0.84929742, 0.41042894],
        [0.62649535, 0.46600596], [0.79270821, -0.41386668],
        [1.16606871, -0.25641059], [1.57356906, 0.30390519],
        [1.0304995, -0.16955962], [1.67314371, 0.19231498],
        [0.98382284, 0.37184502], [0.48921682, -1.38504507],
        [-0.46226554, -0.50481004], [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997], [0.80541964, -0.34465185],
        [0.1732627, -1.61323172], [0.69804044, 0.44810796],
        [-0.5506368, -0.42072426], [-0.34474418, 0.21969797],
    ]
)
Y = np.array(
    [1, 2, 2, 2, 1, 1, 0, 2, 1, 1, 1, 2, 2, 0, 1, 2, 1, 2, 1, 1,
     2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 0, 2, 2, 2, 2, 1, 2, 0]
)


def test_renn_init():
    renn = RepeatedEditedNearestNeighbours()
    assert renn.n_neighbors == 3
    assert renn.kind_sel == "all"
    assert renn.n_jobs is None


def test_renn_iter_wrong():
    max_iter = -1
    renn = RepeatedEditedNearestNeighbours(max_iter=max_iter)
    with pytest.raises(ValueError):
        renn.fit_resample(X, Y)


def test_renn_fit_resample():
    renn = RepeatedEditedNearestNeighbours()
    X_resampled, y_resampled = renn.fit_resample(X, Y)
    X_gt = np.array(
        [
            [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
            [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
            [1.02956816, 0.36061601], [1.12202806, 0.33811558],
            [0.73489726, 0.43915195], [0.50307437, 0.498805],
            [0.84929742, 0.41042894], [0.62649535, 0.46600596],
            [0.98382284, 0.37184502], [0.69804044, 0.44810796],
            [0.04296502, -0.37981873], [0.28294738, -1.00125525],
            [0.34218094, -0.58781961], [0.2096964, -0.61814058],
            [1.59068979, -0.96622933], [0.73418199, -0.02222847],
            [0.79270821, -0.41386668], [1.16606871, -0.25641059],
            [1.0304995, -0.16955962], [0.48921682, -1.38504507],
            [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
            [0.80541964, -0.34465185], [0.1732627, -1.61323172],
        ]
    )
    y_gt = np.array(
        [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    )
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert 0 < renn.n_iter_ <= renn.max_iter


def test_renn_fit_resample_mode_object():
    renn = RepeatedEditedNearestNeighbours(kind_sel="mode")
    X_resampled, y_resampled = renn.fit_resample(X, Y)
    X_gt = np.array(
        [
            [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
            [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
            [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
            [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
            [2.94290565, -0.13986434], [-1.10146139, 0.91782682],
            [0.73489726, 0.43915195], [-0.28479268, 0.70459548],
            [1.84864913, 0.14729596], [0.50307437, 0.498805],
            [0.84929742, 0.41042894], [0.62649535, 0.46600596],
            [1.67314371, 0.19231498], [0.98382284, 0.37184502],
            [0.69804044, 0.44810796], [1.32319756, -0.13181616],
            [0.04296502, -0.37981873], [0.28294738, -1.00125525],
            [0.34218094, -0.58781961], [0.2096964, -0.61814058],
            [1.59068979, -0.96622933], [0.73418199, -0.02222847],
            [0.79270821, -0.41386668], [1.16606871, -0.25641059],
            [1.0304995, -0.16955962], [0.48921682, -1.38504507],
            [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
            [0.80541964, -0.34465185], [0.1732627, -1.61323172],
        ]
    )
    y_gt = np.array(
        [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    )
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert 0 < renn.n_iter_ <= renn.max_iter


def test_renn_fit_resample_mode():
    nn = NearestNeighbors(n_neighbors=4)
    renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel="mode")
    X_resampled, y_resampled = renn.fit_resample(X, Y)
    X_gt = np.array(
        [
            [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
            [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
            [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
            [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
            [2.94290565, -0.13986434], [-1.10146139, 0.91782682],
            [0.73489726, 0.43915195], [-0.28479268, 0.70459548],
            [1.84864913, 0.14729596], [0.50307437, 0.498805],
            [0.84929742, 0.41042894], [0.62649535, 0.46600596],
            [1.67314371, 0.19231498], [0.98382284, 0.37184502],
            [0.69804044, 0.44810796], [1.32319756, -0.13181616],
            [0.04296502, -0.37981873], [0.28294738, -1.00125525],
            [0.34218094, -0.58781961], [0.2096964, -0.61814058],
            [1.59068979, -0.96622933], [0.73418199, -0.02222847],
            [0.79270821, -0.41386668], [1.16606871, -0.25641059],
            [1.0304995, -0.16955962], [0.48921682, -1.38504507],
            [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
            [0.80541964, -0.34465185], [0.1732627, -1.61323172],
        ]
    )
    y_gt = np.array(
        [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    )
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert 0 < renn.n_iter_ <= renn.max_iter


def test_renn_not_good_object():
    nn = "rnd"
    renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel="mode")
    with pytest.raises(ValueError):
        renn.fit_resample(X, Y)


@pytest.mark.parametrize(
    "max_iter, n_iter",
    [(2, 2), (5, 3)],
)
def test_renn_iter_attribute(max_iter, n_iter):
    renn = RepeatedEditedNearestNeighbours(max_iter=max_iter)
    renn.fit_resample(X, Y)
    assert renn.n_iter_ == n_iter
imbalanced-learn-0.7.0/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py000066400000000000000000000040601366766276300330650ustar00rootroot00000000000000
"""Test the module Tomek's links."""
# Authors: Guillaume Lemaitre
#          Christos Aridas
# License: MIT

import numpy as np

from sklearn.utils._testing import assert_array_equal

from imblearn.under_sampling import TomekLinks

X = np.array(
    [
        [0.31230513, 0.1216318], [0.68481731, 0.51935141],
        [1.34192108, -0.13367336], [0.62366841, -0.21312976],
        [1.61091956, -0.40283504], [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342], [0.2184254, 0.24299982],
        [0.61472253, -0.82309052], [0.19893132, -0.47761769],
        [1.06514042, -0.0770537], [0.97407872, 0.44454207],
        [1.40301027, -0.83648734], [-1.20515198, -1.02689695],
        [-0.27410027, -0.54194484], [0.8381014, 0.44085498],
        [-0.23374509, 0.18370049], [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929], [1.79580611, -0.02219234],
    ]
)
Y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])


def test_tl_init():
    tl = TomekLinks()
    assert tl.n_jobs is None


def test_tl_fit_resample():
    tl = TomekLinks()
    X_resampled, y_resampled = tl.fit_resample(X, Y)
    X_gt = np.array(
        [
            [0.31230513, 0.1216318], [0.68481731, 0.51935141],
            [1.34192108, -0.13367336], [0.62366841, -0.21312976],
            [1.61091956, -0.40283504], [-0.37162401, -2.19400981],
            [0.74680821, 1.63827342], [0.2184254, 0.24299982],
            [0.61472253, -0.82309052], [0.19893132, -0.47761769],
            [0.97407872, 0.44454207], [1.40301027, -0.83648734],
            [-1.20515198, -1.02689695], [-0.23374509, 0.18370049],
            [-0.32635887, -0.29299653], [-0.00288378, 0.84259929],
            [1.79580611, -0.02219234],
        ]
    )
    y_gt = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
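
# Hedged usage sketch (added for illustration, not an original test):
# TomekLinks only removes the majority samples that participate in a Tomek
# link, so the output size is data dependent rather than fixed by a ratio.
if __name__ == "__main__":  # pragma: no cover
    X_res, y_res = TomekLinks().fit_resample(X, Y)
    print("class counts after cleaning:", np.bincount(y_res))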
imbalanced-learn-0.7.0/imblearn/under_sampling/base.py000066400000000000000000000062461366766276300230160ustar00rootroot00000000000000
"""
Base class for the under-sampling method.
"""
# Authors: Guillaume Lemaitre
# License: MIT

from ..base import BaseSampler


class BaseUnderSampler(BaseSampler):
    """Base class for under-sampling algorithms.

    Warning: This class should not be used directly. Use the derived classes
    instead.
    """

    _sampling_type = "under-sampling"

    _sampling_strategy_docstring = """sampling_strategy : float, str, dict, callable, default='auto'
        Sampling information to sample the data set.

        - When ``float``, it corresponds to the desired ratio of the number of
          samples in the minority class over the number of samples in the
          majority class after resampling. Therefore, the ratio is expressed
          as :math:`\\alpha_{us} = N_{m} / N_{rM}` where :math:`N_{m}` is the
          number of samples in the minority class and :math:`N_{rM}` is the
          number of samples in the majority class after resampling.

          .. warning::
             ``float`` is only available for **binary** classification. An
             error is raised for multi-class classification.

        - When ``str``, specify the class targeted by the resampling. The
          number of samples in the different classes will be equalized.
          Possible choices are:

            ``'majority'``: resample only the majority class;

            ``'not minority'``: resample all classes but the minority class;

            ``'not majority'``: resample all classes but the majority class;

            ``'all'``: resample all classes;

            ``'auto'``: equivalent to ``'not minority'``.

        - When ``dict``, the keys correspond to the targeted classes. The
          values correspond to the desired number of samples for each targeted
          class.

        - When callable, function taking ``y`` and returns a ``dict``. The
          keys correspond to the targeted classes. The values correspond to
          the desired number of samples for each class.
        """.rstrip()


class BaseCleaningSampler(BaseSampler):
    """Base class for under-sampling algorithms.

    Warning: This class should not be used directly. Use the derived classes
    instead.
    """

    _sampling_type = "clean-sampling"

    _sampling_strategy_docstring = """sampling_strategy : str, list or callable
        Sampling information to sample the data set.

        - When ``str``, specify the class targeted by the resampling. Note
          that the number of samples will not be equal in each class.
          Possible choices are:

            ``'majority'``: resample only the majority class;

            ``'not minority'``: resample all classes but the minority class;

            ``'not majority'``: resample all classes but the majority class;

            ``'all'``: resample all classes;

            ``'auto'``: equivalent to ``'not minority'``.

        - When ``list``, the list contains the classes targeted by the
          resampling.

        - When callable, function taking ``y`` and returns a ``dict``. The
          keys correspond to the targeted classes. The values correspond to
          the desired number of samples for each class.
        """.rstrip()
imbalanced-learn-0.7.0/imblearn/utils/000077500000000000000000000000001366766276300176535ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/utils/__init__.py000066400000000000000000000005611366766276300217660ustar00rootroot00000000000000
"""
The :mod:`imblearn.utils` module includes various utilities.
"""
from ._docstring import Substitution
from ._validation import check_neighbors_object
from ._validation import check_target_type
from ._validation import check_sampling_strategy

__all__ = [
    "check_neighbors_object",
    "check_sampling_strategy",
    "check_target_type",
    "Substitution",
]
imbalanced-learn-0.7.0/imblearn/utils/_docstring.py000066400000000000000000000027141366766276300223640ustar00rootroot00000000000000
"""Utilities for docstring in imbalanced-learn."""
# Authors: Guillaume Lemaitre
# License: MIT


class Substitution:
    """Decorate a function's or a class' docstring to perform string
    substitution on it.

    This decorator should be robust even if ``obj.__doc__`` is None (for
    example, if -OO was passed to the interpreter).
    """

    def __init__(self, *args, **kwargs):
        if args and kwargs:
            raise AssertionError("Only positional or keyword args are allowed")
        self.params = args or kwargs

    def __call__(self, obj):
        obj.__doc__ = obj.__doc__.format(**self.params)
        return obj


_random_state_docstring = """random_state : int, RandomState instance, default=None
    Control the randomization of the algorithm.

    - If int, ``random_state`` is the seed used by the random number
      generator;
    - If ``RandomState`` instance, random_state is the random number
      generator;
    - If ``None``, the random number generator is the ``RandomState``
      instance used by ``np.random``.
    """.rstrip()

_n_jobs_docstring = """n_jobs : int, default=None
    Number of CPU cores used during the cross-validation loop.
    ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
    ``-1`` means using all processors. See
    `Glossary <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
    for more details.
    """.rstrip()
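
# Hedged usage sketch (illustration only; ``_demo_substitution`` is a
# hypothetical helper, not part of this module): the samplers in this package
# decorate their classes with ``Substitution`` so that the shared snippets
# above are expanded into their docstrings.
def _demo_substitution():
    @Substitution(random_state=_random_state_docstring)
    def sample(random_state=None):
        """Dummy sampler.

        Parameters
        ----------
        {random_state}
        """

    # the ``{random_state}`` placeholder is now replaced by the shared text
    return sample.__doc__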
imbalanced-learn-0.7.0/imblearn/utils/_show_versions.py000066400000000000000000000044531366766276300233020ustar00rootroot00000000000000
"""
Utility method which prints system info to help with debugging,
and filing issues on GitHub.
Adapted from :func:`sklearn.show_versions`,
which was adapted from :func:`pandas.show_versions`
"""
# Author: Alexander L. Hayes
# License: MIT

import sys
import importlib


def _get_deps_info():
    """Overview of the installed version of main dependencies

    Returns
    -------
    deps_info: dict
        version information on relevant Python libraries
    """
    deps = [
        "pip",
        "setuptools",
        "imblearn",
        "sklearn",
        "numpy",
        "scipy",
        "Cython",
        "pandas",
        "keras",
        "tensorflow",
        "joblib",
    ]

    def get_version(module):
        return module.__version__

    deps_info = {}
    for modname in deps:
        try:
            if modname in sys.modules:
                mod = sys.modules[modname]
            else:
                mod = importlib.import_module(modname)
            ver = get_version(mod)
            deps_info[modname] = ver
        except ImportError:
            deps_info[modname] = None
    return deps_info


def show_versions(github=False):
    """Print debugging information.

    Parameters
    ----------
    github : bool,
        If True, wrap system info with GitHub markup.
    """
    from sklearn.utils._show_versions import _get_sys_info

    _sys_info = _get_sys_info()
    _deps_info = _get_deps_info()
    _github_markup = (
        "<details>"
        "<summary>System, Dependency Information</summary>\n\n"
        "**System Information**\n\n"
        "{0}\n"
        "**Python Dependencies**\n\n"
        "{1}\n"
        "</details>"
    )
    if github:
        _sys_markup = ""
        _deps_markup = ""
        for k, stat in _sys_info.items():
            _sys_markup += "* {k:<10}: `{stat}`\n".format(k=k, stat=stat)
        for k, stat in _deps_info.items():
            _deps_markup += "* {k:<10}: `{stat}`\n".format(k=k, stat=stat)
        print(_github_markup.format(_sys_markup, _deps_markup))
    else:
        print("\nSystem:")
        for k, stat in _sys_info.items():
            print("{k:>11}: {stat}".format(k=k, stat=stat))
        print("\nPython dependencies:")
        for k, stat in _deps_info.items():
            print("{k:>11}: {stat}".format(k=k, stat=stat))
imbalanced-learn-0.7.0/imblearn/utils/_validation.py000066400000000000000000000547661366766276300225340ustar00rootroot00000000000000
"""Utilities for input validation"""
# Authors: Guillaume Lemaitre
# License: MIT

import warnings
from collections import OrderedDict
from functools import wraps
from inspect import signature, Parameter
from numbers import Integral, Real

import numpy as np

from sklearn.base import clone
from sklearn.neighbors._base import KNeighborsMixin
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import column_or_1d
from sklearn.utils.multiclass import type_of_target

from ..exceptions import raise_isinstance_error

SAMPLING_KIND = (
    "over-sampling",
    "under-sampling",
    "clean-sampling",
    "ensemble",
    "bypass",
)
TARGET_KIND = ("binary", "multiclass", "multilabel-indicator")


class ArraysTransformer:
    """A class to convert sampler output arrays to their original types."""

    def __init__(self, X, y):
        self.x_props = self._gets_props(X)
        self.y_props = self._gets_props(y)

    def transform(self, X, y):
        X = self._transform_one(X, self.x_props)
        y = self._transform_one(y, self.y_props)
        return X, y

    def _gets_props(self, array):
        props = {}
        props["type"] = array.__class__.__name__
        props["columns"] = getattr(array, "columns", None)
        props["name"] = getattr(array, "name", None)
        props["dtypes"] = getattr(array, "dtypes", None)
        return props

    def _transform_one(self, array, props):
        type_ = props["type"].lower()
        if type_ == "list":
            ret = array.tolist()
        elif type_ == "dataframe":
            import pandas as pd

            ret = pd.DataFrame(array, columns=props["columns"])
            ret = ret.astype(props["dtypes"])
        elif type_ == "series":
            import pandas as pd

            ret = pd.Series(array, dtype=props["dtypes"], name=props["name"])
        else:
            ret = array
        return ret


def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
    """Check that the object is consistent with being a NN.

    Several methods in imblearn rely on NN. Until version 0.4, these objects
    could be passed at initialisation as an integer or a KNeighborsMixin;
    afterwards, only KNeighborsMixin will be accepted. This utility performs
    the type checking and raises an error if the type is wrong.

    Parameters
    ----------
    nn_name : str
        The name associated with the object, used to raise an error if needed.

    nn_object : int or KNeighborsMixin
        The object to be checked.

    additional_neighbor : int, optional (default=0)
        Some algorithms need an additional neighbor.

    Returns
    -------
    nn_object : KNeighborsMixin
        The k-NN object.
    """
    if isinstance(nn_object, Integral):
        return NearestNeighbors(n_neighbors=nn_object + additional_neighbor)
    elif isinstance(nn_object, KNeighborsMixin):
        return clone(nn_object)
    else:
        raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object)

def _count_class_sample(y):
    unique, counts = np.unique(y, return_counts=True)
    return dict(zip(unique, counts))


def check_target_type(y, indicate_one_vs_all=False):
    """Check that the target type conforms to the types supported by the
    current samplers.

    The current samplers should be compatible with ``'binary'``,
    ``'multilabel-indicator'`` and ``'multiclass'`` targets only.

    Parameters
    ----------
    y : ndarray
        The array containing the target.

    indicate_one_vs_all : bool, optional
        Whether to indicate if the targets are encoded in a one-vs-all
        fashion.

    Returns
    -------
    y : ndarray
        The returned target.

    is_one_vs_all : bool, optional
        Indicate if the target was originally encoded in a one-vs-all
        fashion. Only returned if ``indicate_one_vs_all=True``.
    """
    type_y = type_of_target(y)
    if type_y == "multilabel-indicator":
        if np.any(y.sum(axis=1) > 1):
            raise ValueError(
                "Imbalanced-learn currently supports binary, multiclass and "
                "binarized encoded multiclasss targets. Multilabel and "
                "multioutput targets are not supported."
            )
        y = y.argmax(axis=1)
    else:
        y = column_or_1d(y)
    return (y, type_y == "multilabel-indicator") if indicate_one_vs_all else y


def _sampling_strategy_all(y, sampling_type):
    """Returns sampling target by targeting all classes."""
    target_stats = _count_class_sample(y)
    if sampling_type == "over-sampling":
        n_sample_majority = max(target_stats.values())
        sampling_strategy = {
            key: n_sample_majority - value
            for (key, value) in target_stats.items()
        }
    elif (
        sampling_type == "under-sampling" or sampling_type == "clean-sampling"
    ):
        n_sample_minority = min(target_stats.values())
        sampling_strategy = {
            key: n_sample_minority for key in target_stats.keys()
        }
    else:
        raise NotImplementedError

    return sampling_strategy


def _sampling_strategy_majority(y, sampling_type):
    """Returns sampling target by targeting the majority class only."""
    if sampling_type == "over-sampling":
        raise ValueError(
            "'sampling_strategy'='majority' cannot be used with"
            " over-sampler."
        )
    elif (
        sampling_type == "under-sampling" or sampling_type == "clean-sampling"
    ):
        target_stats = _count_class_sample(y)
        class_majority = max(target_stats, key=target_stats.get)
        n_sample_minority = min(target_stats.values())
        sampling_strategy = {
            key: n_sample_minority
            for key in target_stats.keys()
            if key == class_majority
        }
    else:
        raise NotImplementedError

    return sampling_strategy


def _sampling_strategy_not_majority(y, sampling_type):
    """Returns sampling target by targeting all classes but not the
    majority."""
    target_stats = _count_class_sample(y)
    if sampling_type == "over-sampling":
        n_sample_majority = max(target_stats.values())
        class_majority = max(target_stats, key=target_stats.get)
        sampling_strategy = {
            key: n_sample_majority - value
            for (key, value) in target_stats.items()
            if key != class_majority
        }
    elif (
        sampling_type == "under-sampling" or sampling_type == "clean-sampling"
    ):
        n_sample_minority = min(target_stats.values())
        class_majority = max(target_stats, key=target_stats.get)
        sampling_strategy = {
            key: n_sample_minority
            for key in target_stats.keys()
            if key != class_majority
        }
    else:
        raise NotImplementedError

    return sampling_strategy


def _sampling_strategy_not_minority(y, sampling_type):
    """Returns sampling target by targeting all classes but not the
    minority."""
    target_stats = _count_class_sample(y)
    if sampling_type == "over-sampling":
        n_sample_majority = max(target_stats.values())
        class_minority = min(target_stats, key=target_stats.get)
        sampling_strategy = {
            key: n_sample_majority - value
            for (key, value) in target_stats.items()
            if key != class_minority
        }
    elif (
        sampling_type == "under-sampling" or sampling_type == "clean-sampling"
    ):
        n_sample_minority = min(target_stats.values())
        class_minority = min(target_stats, key=target_stats.get)
        sampling_strategy = {
            key: n_sample_minority
            for key in target_stats.keys()
            if key != class_minority
        }
    else:
        raise NotImplementedError

    return sampling_strategy


def _sampling_strategy_minority(y, sampling_type):
    """Returns sampling target by targeting the minority class only."""
    target_stats = _count_class_sample(y)
    if sampling_type == "over-sampling":
        n_sample_majority = max(target_stats.values())
        class_minority = min(target_stats, key=target_stats.get)
        sampling_strategy = {
            key: n_sample_majority - value
            for (key, value) in target_stats.items()
            if key == class_minority
        }
    elif (
        sampling_type == "under-sampling" or sampling_type == "clean-sampling"
    ):
        raise ValueError(
            "'sampling_strategy'='minority' cannot be used with"
            " under-sampler and clean-sampler."
        )
    else:
        raise NotImplementedError

    return sampling_strategy


def _sampling_strategy_auto(y, sampling_type):
    """Returns sampling target auto for over-sampling and not-minority for
    under-sampling."""
    if sampling_type == "over-sampling":
        return _sampling_strategy_not_majority(y, sampling_type)
    elif (
        sampling_type == "under-sampling" or sampling_type == "clean-sampling"
    ):
        return _sampling_strategy_not_minority(y, sampling_type)


def _sampling_strategy_dict(sampling_strategy, y, sampling_type):
    """Returns sampling target by converting the dictionary depending of the
    sampling."""
    target_stats = _count_class_sample(y)
    # check that all keys in sampling_strategy are also in y
    set_diff_sampling_strategy_target = set(sampling_strategy.keys()) - set(
        target_stats.keys()
    )
    if len(set_diff_sampling_strategy_target) > 0:
        raise ValueError(
            "The {} target class is/are not present in the"
            " data.".format(set_diff_sampling_strategy_target)
        )
    # check that there is no negative number
    if any(n_samples < 0 for n_samples in sampling_strategy.values()):
        raise ValueError(
            "The number of samples in a class cannot be negative."
            "'sampling_strategy' contains some negative value: {}".format(
                sampling_strategy
            )
        )
    sampling_strategy_ = {}
    if sampling_type == "over-sampling":
        n_samples_majority = max(target_stats.values())
        class_majority = max(target_stats, key=target_stats.get)
        for class_sample, n_samples in sampling_strategy.items():
            if n_samples < target_stats[class_sample]:
                raise ValueError(
                    "With over-sampling methods, the number"
                    " of samples in a class should be greater"
                    " or equal to the original number of samples."
                    " Originally, there is {} samples and {}"
                    " samples are asked.".format(
                        target_stats[class_sample], n_samples
                    )
                )
            if n_samples > n_samples_majority:
                warnings.warn(
                    "After over-sampling, the number of samples ({})"
                    " in class {} will be larger than the number of"
                    " samples in the majority class (class #{} ->"
                    " {})".format(
                        n_samples,
                        class_sample,
                        class_majority,
                        n_samples_majority,
                    )
                )
            sampling_strategy_[class_sample] = (
                n_samples - target_stats[class_sample]
            )
    elif sampling_type == "under-sampling":
        for class_sample, n_samples in sampling_strategy.items():
            if n_samples > target_stats[class_sample]:
                raise ValueError(
                    "With under-sampling methods, the number of"
                    " samples in a class should be less or equal"
                    " to the original number of samples."
                    " Originally, there is {} samples and {}"
                    " samples are asked.".format(
                        target_stats[class_sample], n_samples
                    )
                )
            sampling_strategy_[class_sample] = n_samples
    elif sampling_type == "clean-sampling":
        raise ValueError(
            "'sampling_strategy' as a dict for cleaning methods is "
            "not supported. Please give a list of the classes to be "
            "targeted by the sampling."
        )
    else:
        raise NotImplementedError

    return sampling_strategy_
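
# Hedged sketch (illustration only; ``_demo_sampling_strategy_dict`` is a
# hypothetical helper with made-up counts): for over-sampling the requested
# totals are converted into numbers of samples *to generate*, while for
# under-sampling the requested totals are kept as-is.
def _demo_sampling_strategy_dict():
    y = np.array([0] * 900 + [1] * 100)
    # request 500 minority samples in total -> 400 new samples to generate
    assert _sampling_strategy_dict({1: 500}, y, "over-sampling") == {1: 400}
    # request 100 majority samples -> keep exactly 100 of them
    assert _sampling_strategy_dict({0: 100}, y, "under-sampling") == {0: 100}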

def _sampling_strategy_list(sampling_strategy, y, sampling_type):
    """With cleaning methods, sampling_strategy can be a list to target the
    classes of interest."""
    if sampling_type != "clean-sampling":
        raise ValueError(
            "'sampling_strategy' cannot be a list for samplers "
            "which are not cleaning methods."
        )
    target_stats = _count_class_sample(y)
    # check that all keys in sampling_strategy are also in y
    set_diff_sampling_strategy_target = set(sampling_strategy) - set(
        target_stats.keys()
    )
    if len(set_diff_sampling_strategy_target) > 0:
        raise ValueError(
            "The {} target class is/are not present in the"
            " data.".format(set_diff_sampling_strategy_target)
        )
    return {
        class_sample: min(target_stats.values())
        for class_sample in sampling_strategy
    }


def _sampling_strategy_float(sampling_strategy, y, sampling_type):
    """Take a proportion of the majority (over-sampling) or minority
    (under-sampling) class in binary classification."""
    type_y = type_of_target(y)
    if type_y != "binary":
        raise ValueError(
            '"sampling_strategy" can be a float only when the type '
            "of target is binary. For multi-class, use a dict."
        )
    target_stats = _count_class_sample(y)
    if sampling_type == "over-sampling":
        n_sample_majority = max(target_stats.values())
        class_majority = max(target_stats, key=target_stats.get)
        sampling_strategy_ = {
            key: int(n_sample_majority * sampling_strategy - value)
            for (key, value) in target_stats.items()
            if key != class_majority
        }
        if any([n_samples <= 0 for n_samples in sampling_strategy_.values()]):
            raise ValueError(
                "The specified ratio would require removing samples "
                "from the minority class while trying to generate new "
                "samples. Please increase the ratio."
            )
    elif sampling_type == "under-sampling":
        n_sample_minority = min(target_stats.values())
        class_minority = min(target_stats, key=target_stats.get)
        sampling_strategy_ = {
            key: int(n_sample_minority / sampling_strategy)
            for (key, value) in target_stats.items()
            if key != class_minority
        }
        if any(
            [
                n_samples > target_stats[target]
                for target, n_samples in sampling_strategy_.items()
            ]
        ):
            raise ValueError(
                "The specified ratio would require generating new "
                "samples in the majority class while trying to "
                "remove samples. Please increase the ratio."
            )
    else:
        raise ValueError(
            "'clean-sampling' methods do not let the user "
            "specify the sampling ratio."
        )
    return sampling_strategy_


def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs):
    """Sampling target validation for samplers.

    Checks that ``sampling_strategy`` is of consistent type and return a
    dictionary containing each targeted class with its corresponding
    number of sample. It is used in :class:`imblearn.base.BaseSampler`.

    Parameters
    ----------
    sampling_strategy : float, str, dict, list or callable
        Sampling information to sample the data set.

        - When ``float``:

            For **under-sampling methods**, it corresponds to the ratio
            :math:`\\alpha_{us}` defined by :math:`N_{rM} = N_{m} /
            \\alpha_{us}` where :math:`N_{rM}` and :math:`N_{m}` are the
            number of samples in the majority class after resampling and the
            number of samples in the minority class, respectively;

            For **over-sampling methods**, it corresponds to the ratio
            :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os}
            \\times N_{M}` where :math:`N_{rm}` and :math:`N_{M}` are the
            number of samples in the minority class after resampling and the
            number of samples in the majority class, respectively.

            .. warning::
               ``float`` is only available for **binary** classification. An
               error is raised for multi-class classification and with
               cleaning samplers.

        - When ``str``, specify the class targeted by the resampling. For
          **under- and over-sampling methods**, the number of samples in the
          different classes will be equalized. For **cleaning methods**, the
          number of samples will not be equal. Possible choices are:

            ``'minority'``: resample only the minority class;

            ``'majority'``: resample only the majority class;

            ``'not minority'``: resample all classes but the minority class;

            ``'not majority'``: resample all classes but the majority class;

            ``'all'``: resample all classes;

            ``'auto'``: for under-sampling methods, equivalent to ``'not
            minority'`` and for over-sampling methods, equivalent to ``'not
            majority'``.

        - When ``dict``, the keys correspond to the targeted classes. The
          values correspond to the desired number of samples for each targeted
          class.

          .. warning::
             ``dict`` is available for both **under- and over-sampling
             methods**. An error is raised with **cleaning methods**. Use a
             ``list`` instead.

        - When ``list``, the list contains the targeted classes. It is used
          only for **cleaning methods**.

          .. warning::
             ``list`` is available for **cleaning methods**. An error is
             raised with **under- and over-sampling methods**.

        - When callable, function taking ``y`` and returns a ``dict``. The
          keys correspond to the targeted classes. The values correspond to
          the desired number of samples for each class.

    y : ndarray, shape (n_samples,)
        The target array.

    sampling_type : str
        The type of sampling. Can be either ``'over-sampling'``,
        ``'under-sampling'``, or ``'clean-sampling'``.

    kwargs : dict, optional
        Dictionary of additional keyword arguments to pass to
        ``sampling_strategy`` when this is a callable.

    Returns
    -------
    sampling_strategy_converted : dict
        The converted and validated sampling target. Returns a dictionary with
        the key being the class target and the value being the desired
        number of samples.
    """
    if sampling_type not in SAMPLING_KIND:
        raise ValueError(
            "'sampling_type' should be one of {}. Got '{}'"
            " instead.".format(SAMPLING_KIND, sampling_type)
        )

    if np.unique(y).size <= 1:
        raise ValueError(
            "The target 'y' needs to have more than 1 class."
            " Got {} class instead".format(np.unique(y).size)
        )

    if sampling_type in ("ensemble", "bypass"):
        return sampling_strategy

    if isinstance(sampling_strategy, str):
        if sampling_strategy not in SAMPLING_TARGET_KIND.keys():
            raise ValueError(
                "When 'sampling_strategy' is a string, it needs"
                " to be one of {}. Got '{}' instead.".format(
                    SAMPLING_TARGET_KIND, sampling_strategy
                )
            )
        return OrderedDict(
            sorted(
                SAMPLING_TARGET_KIND[sampling_strategy](
                    y, sampling_type
                ).items()
            )
        )
    elif isinstance(sampling_strategy, dict):
        return OrderedDict(
            sorted(
                _sampling_strategy_dict(
                    sampling_strategy, y, sampling_type
                ).items()
            )
        )
    elif isinstance(sampling_strategy, list):
        return OrderedDict(
            sorted(
                _sampling_strategy_list(
                    sampling_strategy, y, sampling_type
                ).items()
            )
        )
    elif isinstance(sampling_strategy, Real):
        if sampling_strategy <= 0 or sampling_strategy > 1:
            raise ValueError(
                "When 'sampling_strategy' is a float, it should be "
                "in the range (0, 1]. Got {} instead.".format(
                    sampling_strategy
                )
            )
        return OrderedDict(
            sorted(
                _sampling_strategy_float(
                    sampling_strategy, y, sampling_type
                ).items()
            )
        )
    elif callable(sampling_strategy):
        sampling_strategy_ = sampling_strategy(y, **kwargs)
        return OrderedDict(
            sorted(
                _sampling_strategy_dict(
                    sampling_strategy_, y, sampling_type
                ).items()
            )
        )


SAMPLING_TARGET_KIND = {
    "minority": _sampling_strategy_minority,
    "majority": _sampling_strategy_majority,
    "not minority": _sampling_strategy_not_minority,
    "not majority": _sampling_strategy_not_majority,
    "all": _sampling_strategy_all,
    "auto": _sampling_strategy_auto,
}


def _deprecate_positional_args(f):
    """Decorator for methods that issues warnings for positional arguments

    Using the keyword-only argument syntax in pep 3102, arguments after the
    * will issue a warning when passed as a positional argument.

    Parameters
    ----------
    f : function
        function to check arguments on
    """
    sig = signature(f)
    kwonly_args = []
    all_args = []

    for name, param in sig.parameters.items():
        if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
            all_args.append(name)
        elif param.kind == Parameter.KEYWORD_ONLY:
            kwonly_args.append(name)

    @wraps(f)
    def inner_f(*args, **kwargs):
        extra_args = len(args) - len(all_args)
        if extra_args > 0:
            # ignore first 'self' argument for instance methods
            args_msg = ['{}={}'.format(name, arg)
                        for name, arg in zip(kwonly_args[:extra_args],
                                             args[-extra_args:])]
            warnings.warn("Pass {} as keyword args. From version 0.9 "
                          "passing these as positional arguments will "
                          "result in an error".format(", ".join(args_msg)),
                          FutureWarning)
        kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
        return f(**kwargs)
    return inner_f
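
# Hedged sketch (illustration only; ``_demo_check_sampling_strategy`` is a
# hypothetical helper with made-up counts): every accepted input form is
# normalised to an ordered mapping of class -> number of samples.
def _demo_check_sampling_strategy():
    y = np.array([0] * 900 + [1] * 100)
    # 'auto' targets every class but the majority one for over-sampling
    assert dict(check_sampling_strategy("auto", y, "over-sampling")) == {1: 800}
    # a float ratio of 0.5 keeps N_m / 0.5 = 200 majority samples
    assert dict(check_sampling_strategy(0.5, y, "under-sampling")) == {0: 200}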
Use '{}' instead.".format( param_deprecated, version_deprecation, version_removed, sampler.__class__, new_param, ), category=DeprecationWarning, ) setattr(sampler, new_param, getattr(sampler, param_deprecated)) imbalanced-learn-0.7.0/imblearn/utils/estimator_checks.py000066400000000000000000000323261366766276300235620ustar00rootroot00000000000000"""Utils to check the samplers and compatibility with scikit-learn""" # Adapated from scikit-learn # Authors: Guillaume Lemaitre # License: MIT import sys import traceback import warnings from collections import Counter from functools import partial import pytest import numpy as np from scipy import sparse from sklearn.base import clone from sklearn.datasets import ( fetch_openml, make_classification, make_multilabel_classification, ) # noqa from sklearn.cluster import KMeans from sklearn.exceptions import SkipTestWarning from sklearn.preprocessing import label_binarize from sklearn.utils.estimator_checks import _mark_xfail_checks from sklearn.utils.estimator_checks import _set_check_estimator_ids from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_raises_regex from sklearn.utils.multiclass import type_of_target from imblearn.datasets import make_imbalance from imblearn.over_sampling.base import BaseOverSampler from imblearn.under_sampling.base import BaseCleaningSampler, BaseUnderSampler def _set_checking_parameters(estimator): params = estimator.get_params() name = estimator.__class__.__name__ if "n_estimators" in params: estimator.set_params(n_estimators=min(5, estimator.n_estimators)) if name == "ClusterCentroids": estimator.set_params( voting="soft", estimator=KMeans(random_state=0, algorithm="full"), ) if name == "KMeansSMOTE": estimator.set_params(kmeans_estimator=12) def _yield_sampler_checks(sampler): yield check_target_type yield check_samplers_one_label yield check_samplers_fit yield check_samplers_fit_resample yield check_samplers_sampling_strategy_fit_resample yield check_samplers_sparse yield check_samplers_pandas yield check_samplers_list yield check_samplers_multiclass_ova yield check_samplers_preserve_dtype yield check_samplers_sample_indices yield check_samplers_2d_target def _yield_classifier_checks(classifier): yield check_classifier_on_multilabel_or_multioutput_targets yield check_classifiers_with_encoded_labels def _yield_all_checks(estimator): name = estimator.__class__.__name__ tags = estimator._get_tags() if tags["_skip_test"]: warnings.warn( f"Explicit SKIP via _skip_test tag for estimator {name}.", SkipTestWarning ) return # trigger our checks if this is a SamplerMixin if hasattr(estimator, "fit_resample"): for check in _yield_sampler_checks(estimator): yield check if hasattr(estimator, "predict"): for check in _yield_classifier_checks(estimator): yield check def parametrize_with_checks(estimators): """Pytest specific decorator for parametrizing estimator checks. The `id` of each check is set to be a pprint version of the estimator and the name of the check with its keyword arguments. This allows to use `pytest -k` to specify which tests to run:: pytest test_check_estimators.py -k check_estimators_fit_returns_self Parameters ---------- estimators : list of estimators instances Estimators to generated checks for. 
Returns ------- decorator : `pytest.mark.parametrize` Examples -------- >>> from sklearn.utils.estimator_checks import parametrize_with_checks >>> from sklearn.linear_model import LogisticRegression >>> from sklearn.tree import DecisionTreeRegressor >>> @parametrize_with_checks([LogisticRegression(), ... DecisionTreeRegressor()]) ... def test_sklearn_compatible_estimator(estimator, check): ... check(estimator) """ names = (type(estimator).__name__ for estimator in estimators) checks_generator = ((clone(estimator), partial(check, name)) for name, estimator in zip(names, estimators) for check in _yield_all_checks(estimator)) checks_with_marks = ( _mark_xfail_checks(estimator, check, pytest) for estimator, check in checks_generator) return pytest.mark.parametrize("estimator, check", checks_with_marks, ids=_set_check_estimator_ids) def check_target_type(name, estimator): # should raise warning if the target is continuous (we cannot raise error) X = np.random.random((20, 2)) y = np.linspace(0, 1, 20) msg = "Unknown label type: 'continuous'" assert_raises_regex( ValueError, msg, estimator.fit_resample, X, y, ) # if the target is multilabel then we should raise an error rng = np.random.RandomState(42) y = rng.randint(2, size=(20, 3)) msg = "Multilabel and multioutput targets are not supported." assert_raises_regex( ValueError, msg, estimator.fit_resample, X, y, ) def check_samplers_one_label(name, sampler): error_string_fit = "Sampler can't balance when only one class is present." X = np.random.random((20, 2)) y = np.zeros(20) try: sampler.fit_resample(X, y) except ValueError as e: if "class" not in repr(e): print(error_string_fit, sampler.__class__.__name__, e) traceback.print_exc(file=sys.stdout) raise e else: return except Exception as exc: print(error_string_fit, traceback, exc) traceback.print_exc(file=sys.stdout) raise exc raise AssertionError(error_string_fit) def check_samplers_fit(name, sampler): np.random.seed(42) # Make this test reproducible X = np.random.random((30, 2)) y = np.array([1] * 20 + [0] * 10) sampler.fit_resample(X, y) assert hasattr( sampler, "sampling_strategy_" ), "No fitted attribute sampling_strategy_" def check_samplers_fit_resample(name, sampler): X, y = make_classification( n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0, ) target_stats = Counter(y) X_res, y_res = sampler.fit_resample(X, y) if isinstance(sampler, BaseOverSampler): target_stats_res = Counter(y_res) n_samples = max(target_stats.values()) assert all(value >= n_samples for value in Counter(y_res).values()) elif isinstance(sampler, BaseUnderSampler): n_samples = min(target_stats.values()) if name == "InstanceHardnessThreshold": # IHT does not enforce the number of samples but provide a number # of samples the closest to the desired target. 
assert all( Counter(y_res)[k] <= target_stats[k] for k in target_stats.keys() ) else: assert all(value == n_samples for value in Counter(y_res).values()) elif isinstance(sampler, BaseCleaningSampler): target_stats_res = Counter(y_res) class_minority = min(target_stats, key=target_stats.get) assert all( target_stats[class_sample] > target_stats_res[class_sample] for class_sample in target_stats.keys() if class_sample != class_minority ) def check_samplers_sampling_strategy_fit_resample(name, sampler): # in this test we will force all samplers to not change the class 1 X, y = make_classification( n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0, ) expected_stat = Counter(y)[1] if isinstance(sampler, BaseOverSampler): sampling_strategy = {2: 498, 0: 498} sampler.set_params(sampling_strategy=sampling_strategy) X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat elif isinstance(sampler, BaseUnderSampler): sampling_strategy = {2: 201, 0: 201} sampler.set_params(sampling_strategy=sampling_strategy) X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat elif isinstance(sampler, BaseCleaningSampler): sampling_strategy = [2, 0] sampler.set_params(sampling_strategy=sampling_strategy) X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat def check_samplers_sparse(name, sampler): # check that sparse matrices can be passed through the sampler leading to # the same results than dense X, y = make_classification( n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0, ) X_sparse = sparse.csr_matrix(X) X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y) X_res, y_res = sampler.fit_resample(X, y) assert sparse.issparse(X_res_sparse) assert_allclose(X_res_sparse.A, X_res) assert_allclose(y_res_sparse, y_res) def check_samplers_pandas(name, sampler): pd = pytest.importorskip("pandas") # Check that the samplers handle pandas dataframe and pandas series X, y = make_classification( n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0, ) X_df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) y_df = pd.DataFrame(y) y_s = pd.Series(y, name="class") X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) X_res_df, y_res_df = sampler.fit_resample(X_df, y_df) X_res, y_res = sampler.fit_resample(X, y) # check that we return the same type for dataframes or series types assert isinstance(X_res_df, pd.DataFrame) assert isinstance(y_res_df, pd.DataFrame) assert isinstance(y_res_s, pd.Series) assert X_df.columns.to_list() == X_res_df.columns.to_list() assert y_df.columns.to_list() == y_res_df.columns.to_list() assert y_s.name == y_res_s.name assert_allclose(X_res_df.to_numpy(), X_res) assert_allclose(y_res_df.to_numpy().ravel(), y_res) assert_allclose(y_res_s.to_numpy(), y_res) def check_samplers_list(name, sampler): # Check that the can samplers handle simple lists X, y = make_classification( n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0, ) X_list = X.tolist() y_list = y.tolist() X_res, y_res = sampler.fit_resample(X, y) X_res_list, y_res_list = sampler.fit_resample(X_list, y_list) assert isinstance(X_res_list, list) assert isinstance(y_res_list, list) assert_allclose(X_res, X_res_list) assert_allclose(y_res, y_res_list) def check_samplers_multiclass_ova(name, sampler): # Check that multiclass target lead to the same results than OVA encoding X, y = make_classification( 
n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0, ) y_ova = label_binarize(y, np.unique(y)) X_res, y_res = sampler.fit_resample(X, y) X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova) assert_allclose(X_res, X_res_ova) assert type_of_target(y_res_ova) == type_of_target(y_ova) assert_allclose(y_res, y_res_ova.argmax(axis=1)) def check_samplers_2d_target(name, sampler): X, y = make_classification( n_samples=100, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0, ) y = y.reshape(-1, 1) # Make the target 2d sampler.fit_resample(X, y) def check_samplers_preserve_dtype(name, sampler): X, y = make_classification( n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0, ) # Cast X and y to not default dtype X = X.astype(np.float32) y = y.astype(np.int32) X_res, y_res = sampler.fit_resample(X, y) assert X.dtype == X_res.dtype, "X dtype is not preserved" assert y.dtype == y_res.dtype, "y dtype is not preserved" def check_samplers_sample_indices(name, sampler): X, y = make_classification( n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0, ) sampler.fit_resample(X, y) sample_indices = sampler._get_tags().get("sample_indices", None) if sample_indices: assert hasattr(sampler, "sample_indices_") is sample_indices else: assert not hasattr(sampler, "sample_indices_") def check_classifier_on_multilabel_or_multioutput_targets(name, estimator): X, y = make_multilabel_classification(n_samples=30) msg = "Multilabel and multioutput targets are not supported." with pytest.raises(ValueError, match=msg): estimator.fit(X, y) def check_classifiers_with_encoded_labels(name, classifier): # Non-regression test for #709 # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/709 pytest.importorskip("pandas") df, y = fetch_openml("iris", version=1, as_frame=True, return_X_y=True) df, y = make_imbalance( df, y, sampling_strategy={ "Iris-setosa": 30, "Iris-versicolor": 20, "Iris-virginica": 50, } ) classifier.set_params( sampling_strategy={ "Iris-setosa": 20, "Iris-virginica": 20, } ) classifier.fit(df, y) assert set(classifier.classes_) == set(y.cat.categories.tolist()) y_pred = classifier.predict(df) assert set(y_pred) == set(y.cat.categories.tolist()) imbalanced-learn-0.7.0/imblearn/utils/testing.py000066400000000000000000000115441366766276300217070ustar00rootroot00000000000000"""Test utilities.""" # Adapted from scikit-learn # Authors: Guillaume Lemaitre # License: MIT import inspect import pkgutil from contextlib import contextmanager from importlib import import_module from re import compile from pathlib import Path from operator import itemgetter from pytest import warns as _warns from sklearn.base import BaseEstimator from sklearn.utils._testing import ignore_warnings def all_estimators(type_filter=None,): """Get a list of all estimators from imblearn. This function crawls the module and gets all classes that inherit from BaseEstimator. Classes that are defined in test-modules are not included. By default meta_estimators are also not included. This function is adapted from sklearn. Parameters ---------- type_filter : string, list of string, or None, default=None Which kind of estimators should be returned. If None, no filter is applied and all estimators are returned. Possible values are 'sampler' to get estimators only of these specific types, or a list of these to get the estimators that fit at least one of the types. 
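For instance, ``all_estimators(type_filter="sampler")`` would return
only the samplers implemented in imblearn.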
Returns ------- estimators : list of tuples List of (name, class), where ``name`` is the class name as string and ``class`` is the actual type of the class. """ from ..base import SamplerMixin def is_abstract(c): if not (hasattr(c, "__abstractmethods__")): return False if not len(c.__abstractmethods__): return False return True all_classes = [] modules_to_ignore = {"tests"} root = str(Path(__file__).parent.parent) # Ignore deprecation warnings triggered at import time and from walking # packages with ignore_warnings(category=FutureWarning): for importer, modname, ispkg in pkgutil.walk_packages( path=[root], prefix='imblearn.'): mod_parts = modname.split(".") if (any(part in modules_to_ignore for part in mod_parts) or '._' in modname): continue module = import_module(modname) classes = inspect.getmembers(module, inspect.isclass) classes = [(name, est_cls) for name, est_cls in classes if not name.startswith("_")] all_classes.extend(classes) all_classes = set(all_classes) estimators = [ c for c in all_classes if (issubclass(c[1], BaseEstimator) and c[0] != "BaseEstimator") ] # get rid of abstract base classes estimators = [c for c in estimators if not is_abstract(c[1])] # get rid of sklearn estimators which have been imported in some classes estimators = [c for c in estimators if "sklearn" not in c[1].__module__] if type_filter is not None: if not isinstance(type_filter, list): type_filter = [type_filter] else: type_filter = list(type_filter) # copy filtered_estimators = [] filters = {"sampler": SamplerMixin} for name, mixin in filters.items(): if name in type_filter: type_filter.remove(name) filtered_estimators.extend( [est for est in estimators if issubclass(est[1], mixin)] ) estimators = filtered_estimators if type_filter: raise ValueError( "Parameter type_filter must be 'sampler' or " "None, got" " %s." % repr(type_filter) ) # drop duplicates, sort for reproducibility # itemgetter is used to ensure the sort does not extend to the 2nd item of # the tuple return sorted(set(estimators), key=itemgetter(0)) @contextmanager def warns(expected_warning, match=None): r"""Assert that a warning is raised with an optional matching pattern Assert that a code block/function call warns ``expected_warning`` and raise a failure exception otherwise. It can be used within a context manager ``with``. Parameters ---------- expected_warning : Warning Warning type. match : regex str or None, optional The pattern to be matched. By default, no check is done. Returns ------- None Examples -------- >>> import warnings >>> from imblearn.utils.testing import warns >>> with warns(UserWarning, match=r'must be \d+$'): ... 
warnings.warn("value must be 42", UserWarning) """ with _warns(expected_warning) as record: yield if match is not None: for each in record: if compile(match).search(str(each.message)) is not None: break else: msg = "'{}' pattern not found in {}".format( match, "{}".format([str(r.message) for r in record]) ) assert False, msg else: pass imbalanced-learn-0.7.0/imblearn/utils/tests/000077500000000000000000000000001366766276300210155ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/utils/tests/__init__.py000066400000000000000000000000001366766276300231140ustar00rootroot00000000000000imbalanced-learn-0.7.0/imblearn/utils/tests/test_deprecation.py000066400000000000000000000011001366766276300247130ustar00rootroot00000000000000"""Test for the deprecation helper""" # Authors: Guillaume Lemaitre # License: MIT from imblearn.utils.deprecation import deprecate_parameter from imblearn.utils.testing import warns class Sampler: def __init__(self): self.a = "something" self.b = "something" def test_deprecate_parameter(): with warns(DeprecationWarning, match="is deprecated from"): deprecate_parameter(Sampler(), "0.2", "a") with warns(DeprecationWarning, match="Use 'b' instead."): deprecate_parameter(Sampler(), "0.2", "a", "b") imbalanced-learn-0.7.0/imblearn/utils/tests/test_docstring.py000066400000000000000000000023001366766276300244150ustar00rootroot00000000000000"""Test utilities for docstring.""" # Authors: Guillaume Lemaitre # License: MIT import pytest from imblearn.utils import Substitution from imblearn.utils._docstring import _random_state_docstring from imblearn.utils._docstring import _n_jobs_docstring func_docstring = """A function. Parameters ---------- xxx yyy """ def func(param_1, param_2): """A function. Parameters ---------- {param_1} {param_2} """ return param_1, param_2 cls_docstring = """A class. Parameters ---------- xxx yyy """ class cls: """A class. 
Parameters ---------- {param_1} {param_2} """ def __init__(self, param_1, param_2): self.param_1 = param_1 self.param_2 = param_2 @pytest.mark.parametrize( "obj, obj_docstring", [(func, func_docstring), (cls, cls_docstring)] ) def test_docstring_inject(obj, obj_docstring): obj_injected_docstring = Substitution(param_1="xxx", param_2="yyy")(obj) assert obj_injected_docstring.__doc__ == obj_docstring def test_docstring_template(): assert "random_state" in _random_state_docstring assert "n_jobs" in _n_jobs_docstring imbalanced-learn-0.7.0/imblearn/utils/tests/test_estimator_checks.py000066400000000000000000000052231366766276300257570ustar00rootroot00000000000000import pytest import numpy as np from sklearn.base import BaseEstimator from sklearn.utils import check_X_y from sklearn.utils.multiclass import check_classification_targets from imblearn.base import BaseSampler from imblearn.utils.estimator_checks import check_target_type from imblearn.utils.estimator_checks import check_samplers_one_label from imblearn.utils.estimator_checks import check_samplers_fit from imblearn.utils.estimator_checks import check_samplers_sparse from imblearn.utils.estimator_checks import check_samplers_preserve_dtype class BaseBadSampler(BaseEstimator): """Sampler without inputs checking.""" _sampling_type = "bypass" def fit(self, X, y): return self def fit_resample(self, X, y): check_classification_targets(y) self.fit(X, y) return X, y class SamplerSingleClass(BaseSampler): """Sampler that would sample even with a single class.""" _sampling_type = "bypass" def fit_resample(self, X, y): return self._fit_resample(X, y) def _fit_resample(self, X, y): return X, y class NotFittedSampler(BaseBadSampler): """Sampler without target checking.""" def fit(self, X, y): X, y = self._validate_data(X, y) return self class NoAcceptingSparseSampler(BaseBadSampler): """Sampler which does not accept sparse matrix.""" def fit(self, X, y): X, y = self._validate_data(X, y) self.sampling_strategy_ = "sampling_strategy_" return self class NotPreservingDtypeSampler(BaseSampler): _sampling_type = "bypass" def _fit_resample(self, X, y): return X.astype(np.float64), y.astype(np.int64) mapping_estimator_error = { "BaseBadSampler": (AssertionError, "ValueError not raised by fit"), "SamplerSingleClass": (AssertionError, "Sampler can't balance when only"), "NotFittedSampler": (AssertionError, "No fitted attribute"), "NoAcceptingSparseSampler": (TypeError, "A sparse matrix was passed"), "NotPreservingDtypeSampler": (AssertionError, "X dtype is not preserved"), } def _test_single_check(Estimator, check): estimator = Estimator() name = estimator.__class__.__name__ err_type, err_msg = mapping_estimator_error[name] with pytest.raises(err_type, match=err_msg): check(name, estimator) def test_all_checks(): _test_single_check(BaseBadSampler, check_target_type) _test_single_check(SamplerSingleClass, check_samplers_one_label) _test_single_check(NotFittedSampler, check_samplers_fit) _test_single_check(NoAcceptingSparseSampler, check_samplers_sparse) _test_single_check( NotPreservingDtypeSampler, check_samplers_preserve_dtype ) imbalanced-learn-0.7.0/imblearn/utils/tests/test_show_versions.py000066400000000000000000000034341366766276300253420ustar00rootroot00000000000000"""Test for the show_versions helper. Based on the sklearn tests.""" # Author: Alexander L. 
Hayes # License: MIT from imblearn.utils._show_versions import _get_deps_info from imblearn.utils._show_versions import show_versions def test_get_deps_info(): _deps_info = _get_deps_info() assert "pip" in _deps_info assert "setuptools" in _deps_info assert "imblearn" in _deps_info assert "sklearn" in _deps_info assert "numpy" in _deps_info assert "scipy" in _deps_info assert "Cython" in _deps_info assert "pandas" in _deps_info assert "joblib" in _deps_info def test_show_versions_default(capsys): show_versions() out, err = capsys.readouterr() assert "python" in out assert "executable" in out assert "machine" in out assert "pip" in out assert "setuptools" in out assert "imblearn" in out assert "sklearn" in out assert "numpy" in out assert "scipy" in out assert "Cython" in out assert "pandas" in out assert "keras" in out assert "tensorflow" in out assert "joblib" in out def test_show_versions_github(capsys): show_versions(github=True) out, err = capsys.readouterr() assert "
<details><summary>System, Dependency Information</summary>" in out assert "**System Information**" in out assert "* python" in out assert "* executable" in out assert "* machine" in out assert "**Python Dependencies**" in out assert "* pip" in out assert "* setuptools" in out assert "* imblearn" in out assert "* sklearn" in out assert "* numpy" in out assert "* scipy" in out assert "* Cython" in out assert "* pandas" in out assert "* keras" in out assert "* tensorflow" in out assert "* joblib" in out assert "</details>" in out imbalanced-learn-0.7.0/imblearn/utils/tests/test_testing.py000066400000000000000000000033231366766276300241040ustar00rootroot00000000000000"""Test for the testing module""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from pytest import raises from imblearn.base import SamplerMixin from imblearn.utils.testing import all_estimators from imblearn.utils.testing import warns def test_all_estimators(): # check if the filtering is working with a list or a single string type_filter = "sampler" all_estimators(type_filter=type_filter) type_filter = ["sampler"] estimators = all_estimators(type_filter=type_filter) for estimator in estimators: # check that all estimators are samplers assert issubclass(estimator[1], SamplerMixin) # check that an error is raised when the type is unknown type_filter = "rnd" with raises(ValueError, match="Parameter type_filter must be 'sampler'"): all_estimators(type_filter=type_filter) def test_warns(): import warnings with warns(UserWarning, match=r"must be \d+$"): warnings.warn("value must be 42", UserWarning) with raises(AssertionError, match="pattern not found"): with warns(UserWarning, match=r"must be \d+$"): warnings.warn("this is not here", UserWarning) with warns(UserWarning, match=r"aaa"): warnings.warn("cccccccccc", UserWarning) warnings.warn("bbbbbbbbbb", UserWarning) warnings.warn("aaaaaaaaaa", UserWarning) a, b, c = ("aaa", "bbbbbbbbbb", "cccccccccc") expected_msg = r"'{}' pattern not found in \['{}', '{}'\]".format(a, b, c) with raises(AssertionError, match=expected_msg): with warns(UserWarning, match=r"aaa"): warnings.warn("bbbbbbbbbb", UserWarning) warnings.warn("cccccccccc", UserWarning) imbalanced-learn-0.7.0/imblearn/utils/tests/test_validation.py000066400000000000000000000332771366766276300245720ustar00rootroot00000000000000"""Test for the validation helper""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter from collections import OrderedDict import pytest import numpy as np from sklearn.neighbors._base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_array_equal from imblearn.utils.testing import warns from imblearn.utils import check_neighbors_object from imblearn.utils import check_sampling_strategy from imblearn.utils import check_target_type from imblearn.utils._validation import ArraysTransformer from imblearn.utils._validation import _deprecate_positional_args multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25) binary_target = np.array([1] * 25 + [0] * 100) def test_check_neighbors_object(): name = "n_neighbors" n_neighbors = 1 estimator = check_neighbors_object(name, n_neighbors) assert issubclass(type(estimator), KNeighborsMixin) assert estimator.n_neighbors == 1 estimator = check_neighbors_object(name, n_neighbors, 1) assert issubclass(type(estimator), KNeighborsMixin) assert estimator.n_neighbors == 2 estimator = NearestNeighbors(n_neighbors=n_neighbors) estimator_cloned = check_neighbors_object(name, estimator) assert estimator.n_neighbors == estimator_cloned.n_neighbors n_neighbors = "rnd" with pytest.raises(ValueError, match="has to be one of"): check_neighbors_object(name, n_neighbors) @pytest.mark.parametrize( "target, output_target", [ (np.array([0, 1, 1]), np.array([0, 1, 1])), (np.array([0, 1, 2]), np.array([0, 1, 2])), (np.array([[0, 1], [1, 0]]), np.array([1, 0])), ], ) def test_check_target_type(target, output_target): converted_target = check_target_type(target.astype(int))
assert_array_equal(converted_target, output_target.astype(int)) @pytest.mark.parametrize( "target, output_target, is_ova", [ (np.array([0, 1, 1]), np.array([0, 1, 1]), False), (np.array([0, 1, 2]), np.array([0, 1, 2]), False), (np.array([[0, 1], [1, 0]]), np.array([1, 0]), True), ], ) def test_check_target_type_ova(target, output_target, is_ova): converted_target, binarize_target = check_target_type( target.astype(int), indicate_one_vs_all=True ) assert_array_equal(converted_target, output_target.astype(int)) assert binarize_target == is_ova def test_check_sampling_strategy_warning(): msg = "dict for cleaning methods is not supported" with pytest.raises(ValueError, match=msg): check_sampling_strategy( {1: 0, 2: 0, 3: 0}, multiclass_target, "clean-sampling" ) @pytest.mark.parametrize( "ratio, y, type, err_msg", [ ( 0.5, binary_target, "clean-sampling", "'clean-sampling' methods do let the user specify the sampling ratio", # noqa ), ( 0.1, np.array([0] * 10 + [1] * 20), "over-sampling", "remove samples from the minority class while trying to generate new", # noqa ), ( 0.1, np.array([0] * 10 + [1] * 20), "under-sampling", "generate new sample in the majority class while trying to remove", ), ], ) def test_check_sampling_strategy_float_error(ratio, y, type, err_msg): with pytest.raises(ValueError, match=err_msg): check_sampling_strategy(ratio, y, type) def test_check_sampling_strategy_error(): with pytest.raises(ValueError, match="'sampling_type' should be one of"): check_sampling_strategy("auto", np.array([1, 2, 3]), "rnd") error_regex = "The target 'y' needs to have more than 1 class." with pytest.raises(ValueError, match=error_regex): check_sampling_strategy("auto", np.ones((10,)), "over-sampling") error_regex = "When 'sampling_strategy' is a string, it needs to be one of" with pytest.raises(ValueError, match=error_regex): check_sampling_strategy("rnd", np.array([1, 2, 3]), "over-sampling") @pytest.mark.parametrize( "sampling_strategy, sampling_type, err_msg", [ ("majority", "over-sampling", "over-sampler"), ("minority", "under-sampling", "under-sampler"), ], ) def test_check_sampling_strategy_error_wrong_string( sampling_strategy, sampling_type, err_msg ): with pytest.raises( ValueError, match=( "'{}' cannot be used with {}".format(sampling_strategy, err_msg) ), ): check_sampling_strategy( sampling_strategy, np.array([1, 2, 3]), sampling_type ) @pytest.mark.parametrize( "sampling_strategy, sampling_method", [ ({10: 10}, "under-sampling"), ({10: 10}, "over-sampling"), ([10], "clean-sampling"), ], ) def test_sampling_strategy_class_target_unknown( sampling_strategy, sampling_method ): y = np.array([1] * 50 + [2] * 100 + [3] * 25) with pytest.raises(ValueError, match="are not present in the data."): check_sampling_strategy(sampling_strategy, y, sampling_method) def test_sampling_strategy_dict_error(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) sampling_strategy = {1: -100, 2: 50, 3: 25} with pytest.raises(ValueError, match="in a class cannot be negative."): check_sampling_strategy(sampling_strategy, y, "under-sampling") sampling_strategy = {1: 45, 2: 100, 3: 70} error_regex = ( "With over-sampling methods, the number of samples in a" " class should be greater or equal to the original number" " of samples. Originally, there is 50 samples and 45" " samples are asked." 
) with pytest.raises(ValueError, match=error_regex): check_sampling_strategy(sampling_strategy, y, "over-sampling") error_regex = ( "With under-sampling methods, the number of samples in a" " class should be less or equal to the original number of" " samples. Originally, there is 25 samples and 70 samples" " are asked." ) with pytest.raises(ValueError, match=error_regex): check_sampling_strategy(sampling_strategy, y, "under-sampling") @pytest.mark.parametrize("sampling_strategy", [-10, 10]) def test_sampling_strategy_float_error_not_in_range(sampling_strategy): y = np.array([1] * 50 + [2] * 100) with pytest.raises(ValueError, match="it should be in the range"): check_sampling_strategy(sampling_strategy, y, "under-sampling") def test_sampling_strategy_float_error_not_binary(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) with pytest.raises(ValueError, match="the type of target is binary"): sampling_strategy = 0.5 check_sampling_strategy(sampling_strategy, y, "under-sampling") @pytest.mark.parametrize( "sampling_method", ["over-sampling", "under-sampling"] ) def test_sampling_strategy_list_error_not_clean_sampling(sampling_method): y = np.array([1] * 50 + [2] * 100 + [3] * 25) with pytest.raises(ValueError, match="cannot be a list for samplers"): sampling_strategy = [1, 2, 3] check_sampling_strategy(sampling_strategy, y, sampling_method) def _sampling_strategy_func(y): # this function could create an equal number of samples target_stats = Counter(y) n_samples = max(target_stats.values()) return {key: int(n_samples) for key in target_stats.keys()} @pytest.mark.parametrize( "sampling_strategy, sampling_type, expected_sampling_strategy, target", [ ("auto", "under-sampling", {1: 25, 2: 25}, multiclass_target), ("auto", "clean-sampling", {1: 25, 2: 25}, multiclass_target), ("auto", "over-sampling", {1: 50, 3: 75}, multiclass_target), ("all", "over-sampling", {1: 50, 2: 0, 3: 75}, multiclass_target), ("all", "under-sampling", {1: 25, 2: 25, 3: 25}, multiclass_target), ("all", "clean-sampling", {1: 25, 2: 25, 3: 25}, multiclass_target), ("majority", "under-sampling", {2: 25}, multiclass_target), ("majority", "clean-sampling", {2: 25}, multiclass_target), ("minority", "over-sampling", {3: 75}, multiclass_target), ("not minority", "over-sampling", {1: 50, 2: 0}, multiclass_target), ("not minority", "under-sampling", {1: 25, 2: 25}, multiclass_target), ("not minority", "clean-sampling", {1: 25, 2: 25}, multiclass_target), ("not majority", "over-sampling", {1: 50, 3: 75}, multiclass_target), ("not majority", "under-sampling", {1: 25, 3: 25}, multiclass_target), ("not majority", "clean-sampling", {1: 25, 3: 25}, multiclass_target), ( {1: 70, 2: 100, 3: 70}, "over-sampling", {1: 20, 2: 0, 3: 45}, multiclass_target, ), ( {1: 30, 2: 45, 3: 25}, "under-sampling", {1: 30, 2: 45, 3: 25}, multiclass_target, ), ([1], "clean-sampling", {1: 25}, multiclass_target), ( _sampling_strategy_func, "over-sampling", {1: 50, 2: 0, 3: 75}, multiclass_target, ), (0.5, "over-sampling", {1: 25}, binary_target), (0.5, "under-sampling", {0: 50}, binary_target), ], ) def test_check_sampling_strategy( sampling_strategy, sampling_type, expected_sampling_strategy, target ): sampling_strategy_ = check_sampling_strategy( sampling_strategy, target, sampling_type ) assert sampling_strategy_ == expected_sampling_strategy def test_sampling_strategy_dict_over_sampling(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) sampling_strategy = {1: 70, 2: 140, 3: 70} expected_msg = ( r"After over-sampling, the number of samples \(140\) in" r" 
class 2 will be larger than the number of samples in" r" the majority class \(class #2 -> 100\)" ) with warns(UserWarning, expected_msg): check_sampling_strategy(sampling_strategy, y, "over-sampling") def test_sampling_strategy_callable_args(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) multiplier = {1: 1.5, 2: 1, 3: 3} def sampling_strategy_func(y, multiplier): """samples such that each class will be affected by the multiplier.""" target_stats = Counter(y) return { key: int(values * multiplier[key]) for key, values in target_stats.items() } sampling_strategy_ = check_sampling_strategy( sampling_strategy_func, y, "over-sampling", multiplier=multiplier ) assert sampling_strategy_ == {1: 25, 2: 0, 3: 50} @pytest.mark.parametrize( "sampling_strategy, sampling_type, expected_result", [ ( {3: 25, 1: 25, 2: 25}, "under-sampling", OrderedDict({1: 25, 2: 25, 3: 25}), ), ( {3: 100, 1: 100, 2: 100}, "over-sampling", OrderedDict({1: 50, 2: 0, 3: 75}), ), ], ) def test_sampling_strategy_check_order( sampling_strategy, sampling_type, expected_result ): # We pass a non-sorted dictionary on purpose and check that the resulting # dictionary is sorted. Refer to issue #428. y = np.array([1] * 50 + [2] * 100 + [3] * 25) sampling_strategy_ = check_sampling_strategy( sampling_strategy, y, sampling_type ) assert sampling_strategy_ == expected_result def test_arrays_transformer_plain_list(): X = np.array([[0, 0], [1, 1]]) y = np.array([[0, 0], [1, 1]]) arrays_transformer = ArraysTransformer(X.tolist(), y.tolist()) X_res, y_res = arrays_transformer.transform(X, y) assert isinstance(X_res, list) assert isinstance(y_res, list) def test_arrays_transformer_numpy(): X = np.array([[0, 0], [1, 1]]) y = np.array([[0, 0], [1, 1]]) arrays_transformer = ArraysTransformer(X, y) X_res, y_res = arrays_transformer.transform(X, y) assert isinstance(X_res, np.ndarray) assert isinstance(y_res, np.ndarray) def test_arrays_transformer_pandas(): pd = pytest.importorskip("pandas") X = np.array([[0, 0], [1, 1]]) y = np.array([0, 1]) X_df = pd.DataFrame(X, columns=["a", "b"]) X_df = X_df.astype(int) y_df = pd.DataFrame(y, columns=["target", ]) y_df = y_df.astype(int) y_s = pd.Series(y, name="target", dtype=int) # DataFrame and DataFrame case arrays_transformer = ArraysTransformer(X_df, y_df) X_res, y_res = arrays_transformer.transform(X, y) assert isinstance(X_res, pd.DataFrame) assert_array_equal(X_res.columns, X_df.columns) assert_array_equal(X_res.dtypes, X_df.dtypes) assert isinstance(y_res, pd.DataFrame) assert_array_equal(y_res.columns, y_df.columns) assert_array_equal(y_res.dtypes, y_df.dtypes) # DataFrame and Series case arrays_transformer = ArraysTransformer(X_df, y_s) _, y_res = arrays_transformer.transform(X, y) assert isinstance(y_res, pd.Series) assert_array_equal(y_res.name, y_s.name) assert_array_equal(y_res.dtype, y_s.dtype) def test_deprecate_positional_args_warns_for_function(): @_deprecate_positional_args def f1(a, b, *, c=1, d=1): pass with pytest.warns(FutureWarning, match=r"Pass c=3 as keyword args"): f1(1, 2, 3) with pytest.warns(FutureWarning, match=r"Pass c=3, d=4 as keyword args"): f1(1, 2, 3, 4) @_deprecate_positional_args def f2(a=1, *, b=1, c=1, d=1): pass with pytest.warns(FutureWarning, match=r"Pass b=2 as keyword args"): f2(1, 2) # The * is placed before a keyword-only argument without a default value @_deprecate_positional_args def f3(a, *, b, c=1, d=1): pass with pytest.warns(FutureWarning, match=r"Pass b=2 as keyword args"): f3(1, 2)
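# A minimal illustrative addendum (the test name below is ours, not part of
# the original suite): it spells out how the "auto" strategy translates into
# per-class counts for the 50/100/25 distribution used throughout this
# module, mirroring the parametrized expectations above.
def test_check_sampling_strategy_auto_illustration():
    y = np.array([1] * 50 + [2] * 100 + [3] * 25)
    # over-sampling: the dict gives the number of new samples to generate so
    # that classes 1 and 3 reach the majority count (100)
    assert check_sampling_strategy("auto", y, "over-sampling") == {1: 50, 3: 75}
    # under-sampling: the dict gives the number of samples to keep so that
    # classes 1 and 2 match the minority count (25)
    assert check_sampling_strategy("auto", y, "under-sampling") == {1: 25, 2: 25}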
imbalanced-learn-0.7.0/maint_tools/000077500000000000000000000000001366766276300172525ustar00rootroot00000000000000imbalanced-learn-0.7.0/maint_tools/test_docstring.py000066400000000000000000000157601366766276300226650ustar00rootroot00000000000000import re from inspect import signature from typing import Optional import pytest from imblearn.utils.testing import all_estimators numpydoc_validation = pytest.importorskip("numpydoc.validate") # List of whitelisted modules and methods; regexps are supported. # Docstrings not matching one of these patterns are expected to fail, as they # are inherited from scikit-learn. DOCSTRING_WHITELIST = [ "ADASYN$", "ADASYN.", "AllKNN$", "AllKNN.", "BalancedBaggingClassifier$", "BalancedBaggingClassifier.estimators_samples_", "BalancedBaggingClassifier.fit", "BalancedBaggingClassifier.get_params", "BalancedBaggingClassifier.predict", "BalancedBaggingClassifier.score", "BalancedBaggingClassifier.set_params", "BalancedRandomForestClassifier$", "BalancedRandomForestClassifier.apply", "BalancedRandomForestClassifier.feature_importances_", "BalancedRandomForestClassifier.fit", "BalancedRandomForestClassifier.predict$", "BalancedRandomForestClassifier.score", "BalancedRandomForestClassifier.set_params", "ClusterCentroids$", "ClusterCentroids.", "CondensedNearestNeighbour$", "CondensedNearestNeighbour.", "EasyEnsembleClassifier$", "EasyEnsembleClassifier.estimators_samples_", "EasyEnsembleClassifier.fit", "EasyEnsembleClassifier.get_params", "EasyEnsembleClassifier.predict", "EasyEnsembleClassifier.score", "EasyEnsembleClassifier.set_params", "EditedNearestNeighbours$", "EditedNearestNeighbours.", "FunctionSampler$", "FunctionSampler.", "InstanceHardnessThreshold$", "InstanceHardnessThreshold.", "SMOTE$", "SMOTE.", "NearMiss$", "NearMiss.", "NeighbourhoodCleaningRule$", "NeighbourhoodCleaningRule.", "OneSidedSelection$", "OneSidedSelection.", "Pipeline$", "Pipeline.fit$", "Pipeline.fit_transform", "Pipeline.fit_resample", "Pipeline.fit_predict", "RUSBoostClassifier$", "RUSBoostClassifier.", "RandomOverSampler$", "RandomOverSampler.", "RandomUnderSampler$", "RandomUnderSampler.", "TomekLinks$", "TomekLinks.", ] def get_all_methods(): estimators = all_estimators() for name, Estimator in estimators: if name.startswith("_"): # skip private classes continue methods = [] for name in dir(Estimator): if name.startswith("_"): continue method_obj = getattr(Estimator, name) if (hasattr(method_obj, '__call__') or isinstance(method_obj, property)): methods.append(name) methods.append(None) for method in sorted(methods, key=lambda x: str(x)): yield Estimator, method def filter_errors(errors, method): """Ignore some errors based on the method type. These rules are specific to scikit-learn.""" for code, message in errors: # We ignore the following error codes, # - RT02: The first line of the Returns section # should contain only the type, .. # (as we may need to refer to the name of the returned # object) # - GL01: Docstring text (summary) should start in the line # immediately after the opening quotes (not in the same line, # or leaving a blank line in between) if code in ["RT02", "GL01"]: continue # The following codes are only taken into account for the # top-level class docstrings: # - ES01: No extended summary found # - SA01: See Also section not found # - EX01: No examples section found if method is not None and code in ["EX01", "SA01", "ES01"]: continue yield code, message def repr_errors(res, estimator=None, method: Optional[str] = None) -> str: """Pretty print the original docstring and the obtained errors. Parameters ---------- res : dict result of numpydoc.validate.validate estimator : {estimator, None} estimator object or None method : str if estimator is not None, either the method name or None. Returns ------- str String representation of the error. """ if method is None: if hasattr(estimator, "__init__"): method = "__init__" elif estimator is None: raise ValueError( "At least one of estimator, method should be provided" ) else: raise NotImplementedError if estimator is not None: obj = getattr(estimator, method) try: obj_signature = signature(obj) except TypeError: # In particular we can't parse the signature of properties obj_signature = ( "\nParsing of the method signature failed, " "possibly because this is a property." ) obj_name = estimator.__name__ + "." + method else: obj_signature = "" obj_name = method msg = "\n\n" + "\n\n".join( [ str(res["file"]), obj_name + str(obj_signature), res["docstring"], "# Errors", "\n".join( " - {}: {}".format(code, message) for code, message in res["errors"] ), ] ) return msg @pytest.mark.parametrize("Estimator, method", get_all_methods()) def test_docstring(Estimator, method, request): base_import_path = Estimator.__module__ import_path = [base_import_path, Estimator.__name__] if method is not None: import_path.append(method) import_path = ".".join(import_path) if not any(re.search(regex, import_path) for regex in DOCSTRING_WHITELIST): request.applymarker( pytest.mark.xfail( run=False, reason="TODO pass numpydoc validation" ) ) res = numpydoc_validation.validate(import_path) res["errors"] = list(filter_errors(res["errors"], method)) if res["errors"]: msg = repr_errors(res, Estimator, method) raise ValueError(msg) if __name__ == "__main__": import sys import argparse parser = argparse.ArgumentParser( description="Validate docstring with numpydoc." ) parser.add_argument("import_path", help="Import path to validate") args = parser.parse_args() res = numpydoc_validation.validate(args.import_path) import_path_sections = args.import_path.split(".") # When applied to classes, detect the class method. For functions, # method = None. # TODO: this detection can be improved. Currently we assume that we have a # class method if the second-to-last path element is in camel case.
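# e.g. for "imblearn.over_sampling.SMOTE.fit_resample" the element before
# last, "SMOTE", matches the camel-case pattern, so method = "fit_resample";
# for a function path such as "imblearn.pipeline.make_pipeline" there is no
# match and method stays None.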
if len(import_path_sections) >= 2 and re.match( r"(?:[A-Z][a-z]*)+", import_path_sections[-2] ): method = import_path_sections[-1] else: method = None res["errors"] = list(filter_errors(res["errors"], method)) if res["errors"]: msg = repr_errors(res, method=args.import_path) print(msg) sys.exit(1) else: print("All docstring checks passed for {}!".format(args.import_path)) imbalanced-learn-0.7.0/references.bib000066400000000000000000000122361366766276300175250ustar00rootroot00000000000000 @InProceedings{ batista2003, title = {Balancing training data for automated annotation of keywords: A case study}, author = {Batista, Gustavo E. A. P. A. and Bazzan, Ana L. C. and Monard, Maria Carolina}, booktitle = {Proceedings of the 2nd Brazilian Workshop on Bioinformatics}, pages = {10--18}, year = {2003}, month = {Dec.}, address = {Rio de Janeiro, Brazil} } @Article{ batista2004, title = {A study of the behavior of several methods for balancing machine learning training data}, author = {Batista, Gustavo E. A. P. A. and Prati, Ronaldo C. and Monard, Maria Carolina}, journal = {ACM Sigkdd Explorations Newsletter}, volume = {6}, number = {1}, pages = {20--29}, year = {2004}, publisher = {ACM} } @Article{ chawla2002, title = {SMOTE: Synthetic minority over-sampling technique}, author = {Chawla, Nitesh V. and Bowyer, Kevin W. and Hall, Lawrence O. and Kegelmeyer, W. Philip}, journal = {Journal of Artificial Intelligence Research}, volume = {16}, pages = {321--357}, year = {2002} } @InProceedings{ han2005, title = {Borderline-SMOTE: A new over-sampling method in imbalanced data sets learning}, author = {Han, Hui and Wang, Wen-Yuan and Mao, Bing-Huan}, journal = {Advances in intelligent computing}, pages = {878--887}, year = {2005}, booktitle = {Proceedings of the 1st International Conference on Intelligent Computing}, month = {Aug.}, address = {Hefei, China} } @Article{ hart1968, title = {The condensed nearest neighbor rule}, author = {Hart, Peter E.}, journal = {IEEE Transactions on Information Theory}, volume = {14}, number = {3}, pages = {515--516}, year = {1968}, publisher = {IEEE} } @InProceedings{ he2008, title = {ADASYN: Adaptive synthetic sampling approach for imbalanced learning}, author = {He, Haibo and Bai, Yang and Garcia, Edwardo A. 
and Li, Shutao}, booktitle = {Proceedings of the 5th IEEE International Joint Conference on Neural Networks}, pages = {1322--1328}, year = {2008}, organization = {IEEE}, month = {Jun.}, address = {Hong Kong, China} } @InProceedings{ kubat1997, title = {Addressing the curse of imbalanced training sets: One-sided selection}, author = {Kubat, Miroslav and Matwin, Stan}, booktitle = {Proceedings of the 14th International Conference on Machine Learning}, volume = {97}, pages = {179--186}, year = {1997}, address = {Nashville, Tennessee, USA}, month = {July} } @InProceedings{ laurikkala2001, title = {Improving identification of difficult small classes by balancing class distribution}, author = {Laurikkala, Jorma}, journal = {Proceedings of the 8th Conference on Artificial Intelligence in Medicine in Europe}, pages = {63--66}, address = {Cascais, Portugal}, month = {Jul.}, year = {2001}, publisher = {Springer} } @Article{ liu2009, title = {Exploratory undersampling for class-imbalance learning}, author = {Liu, Xu-Ying and Wu, Jianxin and Zhou, Zhi-Hua}, journal = {IEEE Transactions on Systems, Man, and Cybernetics}, volume = {39}, number = {2}, pages = {539--550}, year = {2009}, publisher = {IEEE} } @InProceedings{ mani2003, title = {kNN approach to unbalanced data distributions: A case study involving information extraction}, author = {Mani, Inderjeet and Zhang, Jianping}, booktitle = {Proceedings of the Workshop on Learning from Imbalanced Data Sets}, volume = {126}, year = {2003}, month = {Aug.}, pages = {1--7}, address = {Washington, DC, USA} } @InProceedings{ nguyen2009, title = {Borderline over-sampling for imbalanced data classification}, author = {Nguyen, Hien M. and Cooper, Eric W. and Kamei, Katsuari}, journal = {Proceedings of the 5th International Workshop on computational Intelligence and Applications}, pages = {24--29}, year = {2009} } @Article{ smith2014, title = {An instance level analysis of data complexity}, author = {Smith, Michael R. and Martinez, Tony and Giraud-Carrier, Christophe}, journal = {Machine learning}, volume = {95}, number = {2}, pages = {225--256}, year = {2014}, publisher = {Springer} } @Article{ tomek1976a, title = {Two modifications of CNN}, author = {Tomek, Ivan}, journal = {IEEE Trans. 
Systems, Man and Cybernetics}, volume = {6}, issue = {6}, pages = {769--772}, year = {1976} } @Article{ tomek1976b, title = {An experiment with the edited nearest-neighbor rule}, author = {Tomek, Ivan}, journal = {IEEE Transactions on Systems, Man, and Cybernetics}, number = {6}, issue = {6}, pages = {448--452}, year = {1976} } @Article{ wilson1972, title = {Asymptotic properties of nearest neighbor rules using edited data}, author = {Wilson, Dennis L.}, journal = {IEEE Transactions on Systems, Man, and Cybernetics}, volume = {2}, number = {3}, pages = {408--421}, year = {1972}, publisher = {IEEE} } @article{chen2004using, title={Using random forest to learn imbalanced data}, author={Chen, Chao and Liaw, Andy and Breiman, Leo}, journal={University of California, Berkeley}, volume={110}, pages={1--12}, year={2004} } imbalanced-learn-0.7.0/requirements.optional.txt000066400000000000000000000000211366766276300220250ustar00rootroot00000000000000keras tensorflow imbalanced-learn-0.7.0/requirements.test.txt000066400000000000000000000000441366766276300211620ustar00rootroot00000000000000matplotlib>=2.0.0 pytest pytest-cov imbalanced-learn-0.7.0/requirements.txt000066400000000000000000000000701366766276300202030ustar00rootroot00000000000000numpy>=1.11 scipy>=0.17 scikit-learn>=0.22 joblib>=0.11 imbalanced-learn-0.7.0/setup.cfg000066400000000000000000000012171366766276300165440ustar00rootroot00000000000000[bumpversion] current_version = 0.7.0 tag = False parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<dev>\d+))? serialize = {major}.{minor}.{patch}.{release}{dev} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = gamma values = dev gamma [bumpversion:part:dev] [bumpversion:file:imblearn/_version.py] [aliases] test = pytest [tool:pytest] doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS addopts = --ignore build_tools --ignore benchmarks --ignore doc --ignore examples --ignore maint_tools --doctest-modules -rs filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning imbalanced-learn-0.7.0/setup.py000077500000000000000000000043541366766276300164430ustar00rootroot00000000000000#! /usr/bin/env python """Toolbox for imbalanced dataset in machine learning.""" import codecs import os from setuptools import find_packages, setup # get __version__ from _version.py ver_file = os.path.join('imblearn', '_version.py') with open(ver_file) as f: exec(f.read()) DISTNAME = 'imbalanced-learn' DESCRIPTION = 'Toolbox for imbalanced dataset in machine learning.' with codecs.open('README.rst', encoding='utf-8-sig') as f: LONG_DESCRIPTION = f.read() MAINTAINER = 'G. Lemaitre, C.
Aridas' MAINTAINER_EMAIL = 'g.lemaitre58@gmail.com, ichkoar@gmail.com' URL = 'https://github.com/scikit-learn-contrib/imbalanced-learn' LICENSE = 'MIT' DOWNLOAD_URL = 'https://github.com/scikit-learn-contrib/imbalanced-learn' VERSION = __version__ CLASSIFIERS = ['Intended Audience :: Science/Research', 'Intended Audience :: Developers', 'License :: OSI Approved', 'Programming Language :: C', 'Programming Language :: Python', 'Topic :: Software Development', 'Topic :: Scientific/Engineering', 'Operating System :: Microsoft :: Windows', 'Operating System :: POSIX', 'Operating System :: Unix', 'Operating System :: MacOS', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8'] INSTALL_REQUIRES = [ 'numpy>=1.13.3', 'scipy>=0.19.1', 'scikit-learn>=0.23', 'joblib>=0.11' ] EXTRAS_REQUIRE = { 'tests': [ 'pytest', 'pytest-cov'], 'docs': [ 'sphinx', 'sphinx-gallery', 'sphinx_rtd_theme', 'sphinxcontrib-bibtex', 'numpydoc', 'matplotlib', 'pandas', ] } setup(name=DISTNAME, maintainer=MAINTAINER, maintainer_email=MAINTAINER_EMAIL, description=DESCRIPTION, license=LICENSE, url=URL, version=VERSION, download_url=DOWNLOAD_URL, long_description=LONG_DESCRIPTION, zip_safe=False, # the package can run out of an .egg file classifiers=CLASSIFIERS, packages=find_packages(), install_requires=INSTALL_REQUIRES, extras_require=EXTRAS_REQUIRE)
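# A minimal usage sketch (ours, not part of setup.py): given the extras
# declared in EXTRAS_REQUIRE above, a development install with the test or
# documentation dependencies could be obtained with, e.g.:
#
#     pip install -e .
#     pip install -e ".[tests]"
#     pip install -e ".[docs]"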