pax_global_header00006660000000000000000000000064147454711360014526gustar00rootroot0000000000000052 comment=9d64ea83aa765b882529976966dc03dcba12be58 nvitop-1.4.2/000077500000000000000000000000001474547113600130515ustar00rootroot00000000000000nvitop-1.4.2/.dockerignore000077700000000000000000000000001474547113600175102.gitignoreustar00rootroot00000000000000nvitop-1.4.2/.editorconfig000066400000000000000000000007141474547113600155300ustar00rootroot00000000000000# https://editorconfig.org/ root = true [*] charset = utf-8 end_of_line = lf indent_style = space indent_size = 4 trim_trailing_whitespace = true insert_final_newline = true [*.py] indent_size = 4 src_paths=nvitop [*.{yaml,yml,json,xml}] indent_size = 2 [*.md] indent_size = 2 x-soft-wrap-text = true [*.rst] indent_size = 4 x-soft-wrap-text = true [Makefile] indent_style = tab [*.sh] indent_style = tab [*.bat] end_of_line = crlf indent_style = tab nvitop-1.4.2/.flake8000066400000000000000000000016451474547113600142320ustar00rootroot00000000000000[flake8] max-line-length = 120 max-doc-length = 100 select = B,C,E,F,W,Y,SIM ignore = # E203: whitespace before ':' # E241: whitespace after ':' # E704: multiple statements on one line (def) # W503: line break before binary operator # W504: line break after binary operator # format by black E203,E241,E704,W503,W504, # E501: line too long # W505: doc line too long # too long docstring due to long example blocks E501,W505, # SIM105: Use 'contextlib.suppress(...)' # prefer try-except block SIM105, per-file-ignores = # F401: module imported but unused # intentionally unused imports __init__.py: F401 nvitop/api/host.py: F401 nvitop/api/libnvml.py: F401 # SIM113: use enumerate # false positive nvitop/tui/screens/main/process.py: SIM113 exclude = .git, .vscode, venv, __pycache__, docs/source/conf.py, build, dist nvitop-1.4.2/.gitattributes000066400000000000000000000000351474547113600157420ustar00rootroot00000000000000* text eol=lf *.bat eol=crlf 
nvitop-1.4.2/.github/000077500000000000000000000000001474547113600144115ustar00rootroot00000000000000nvitop-1.4.2/.github/ISSUE_TEMPLATE/000077500000000000000000000000001474547113600165745ustar00rootroot00000000000000nvitop-1.4.2/.github/ISSUE_TEMPLATE/bug-report.yaml000066400000000000000000000110021474547113600215400ustar00rootroot00000000000000name: 🐛 Bug Report description: File an issue about a bug. title: "[BUG] " labels: [bug] assignees: [XuehaiPan] body: - type: markdown attributes: value: | Please do your best to make the issue as easy to act on as possible, and only submit here if there is clearly a problem with `nvitop`. You may try the latest version of `nvitop` in an isolated environment with the following commands first: ```bash pip3 install --upgrade pipx PYTHONFAULTHANDLER=1 pipx run --spec git+https://github.com/XuehaiPan/nvitop.git nvitop ``` - type: checkboxes id: steps attributes: label: Required prerequisites description: Make sure you've completed the following steps before submitting your issue -- thank you! options: - label: I have read the documentation . required: true - label: I have searched the [Issue Tracker](https://github.com/XuehaiPan/nvitop/issues) that this hasn't already been reported. (comment there if it has.) required: true - label: I have tried the latest version of nvitop in a new isolated virtual environment. required: false - type: input id: version attributes: label: What version of nvitop are you using? description: Run command `nvitop --version` or `python3 -m nvitop --version` in your shell and paste the output here. 
placeholder: E.g., 1.0.0 validations: required: true - type: input id: system attributes: label: Operating system and version placeholder: E.g., Ubuntu 20.04 LTS / Windows 10 Build 19043.1110 validations: required: true - type: input id: driver-version attributes: label: NVIDIA driver version placeholder: E.g., 470.161.03 validations: required: true - type: textarea id: nvidia-smi attributes: label: NVIDIA-SMI description: Run command `nvidia-smi` in your shell and paste the output here. render: text - type: textarea id: environment attributes: label: Python environment description: | Describe the characteristic of your environment: - Describe how the library was installed (pip, conda, source, ...) - Python version - Versions of any other relevant libraries Run the following command (copy all of them) in your shell and paste the results in the textbox below. ```bash python3 -m pip freeze | python3 -c 'import sys; print(sys.version, sys.platform); print("".join(filter(lambda s: any(word in s.lower() for word in ("nvi", "cuda", "nvml", "gpu")), sys.stdin)))' ``` validations: required: true - type: textarea id: description attributes: label: Problem description description: >- Provide a short description, state the expected behavior and what actually happens. Include relevant information like what version of nvitop you are using, what system you are on, and any useful commands / output. placeholder: Describe what the problem is. validations: required: true - type: textarea id: code attributes: label: Steps to Reproduce description: >- The code should be minimal, have minimal external dependencies, and isolate the functions that cause breakage. Submit matched and complete snippets that can be easily run to diagnose the issue. value: | The Python snippets (if any): ```python ``` Command lines: ```bash ``` validations: required: true - type: textarea id: traceback attributes: label: Traceback description: Put the Python traceback information here. 
placeholder: | Traceback (most recent call last): File ... render: pytb - type: textarea id: logs attributes: label: Logs description: Run nvitop with `PYTHONFAULTHANDLER=1 LOGLEVEL=DEBUG nvitop` and paste the output here. render: text - type: textarea id: expected attributes: label: Expected behavior description: Provide a clear and concise description of what you expected to happen. - type: textarea id: additional-context attributes: label: Additional context description: >- Add any other context about the problem here. Screenshots may also be helpful. If you know or suspect the reason for this bug, paste the code lines and suggest modifications. nvitop-1.4.2/.github/ISSUE_TEMPLATE/config.yaml000066400000000000000000000000341474547113600207220ustar00rootroot00000000000000blank_issues_enabled: false nvitop-1.4.2/.github/ISSUE_TEMPLATE/feature-request.yaml000066400000000000000000000041011474547113600225750ustar00rootroot00000000000000name: ✨ Feature Request description: Suggest an idea for this project. title: "[Feature Request] " labels: [enhancement] assignees: [XuehaiPan] body: - type: markdown attributes: value: | Please do your best to make the issue as easy to act on as possible, and only submit here if there is clearly a problem with `nvitop`. You may try the latest version of `nvitop` in an isolated environment with the following commands first: ```bash pip3 install --upgrade pipx PYTHONFAULTHANDLER=1 pipx run --spec git+https://github.com/XuehaiPan/nvitop.git nvitop ``` - type: checkboxes id: steps attributes: label: Required prerequisites description: Make sure you've completed the following steps before submitting your issue -- thank you! options: - label: I have searched the [Issue Tracker](https://github.com/XuehaiPan/nvitop/issues) that this hasn't already been reported. (comment there if it has.) required: true - label: I have tried the latest version of nvitop in a new isolated virtual environment. 
required: false - type: textarea id: motivation attributes: label: Motivation description: Outline the motivation for the proposal. value: | validations: required: true - type: textarea id: solution attributes: label: Solution description: Provide a clear and concise description of what you want to happen. - type: textarea id: alternatives attributes: label: Alternatives description: A clear and concise description of any alternative solutions or features you've considered. - type: textarea id: additional-context attributes: label: Additional context description: Add any other context about the problem here. Screenshots may also be helpful. nvitop-1.4.2/.github/ISSUE_TEMPLATE/questions.yaml000066400000000000000000000026621474547113600215200ustar00rootroot00000000000000name: 🤔 Questions / Help / Support description: Do you need support? title: "[Question] " labels: [question] assignees: [XuehaiPan] body: - type: markdown attributes: value: | Please do your best to make the issue as easy to act on as possible, and only submit here if there is clearly a problem with `nvitop`. You may try the latest version of `nvitop` in an isolated environment with the following commands first: ```bash pip3 install --upgrade pipx PYTHONFAULTHANDLER=1 pipx run --spec git+https://github.com/XuehaiPan/nvitop.git nvitop ``` - type: checkboxes id: steps attributes: label: Required prerequisites description: Make sure you've completed the following steps before submitting your issue -- thank you! options: - label: I have read the documentation . required: true - label: I have searched the [Issue Tracker](https://github.com/XuehaiPan/nvitop/issues) that this hasn't already been reported. (comment there if it has.) required: true - label: I have tried the latest version of nvitop in a new isolated virtual environment. required: false - type: textarea id: questions attributes: label: Questions description: Describe your questions with relevant resources such as snippets, links, images, etc. 
validations: required: true nvitop-1.4.2/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000022351474547113600202140ustar00rootroot00000000000000 #### Issue Type - Bug fix - Improvement/feature implementation - Breaking changes #### Runtime Environment - Operating system and version: [e.g. Ubuntu 20.04 LTS / Windows 10 Build 19043.1110] - Terminal emulator and version: [e.g. GNOME Terminal 3.36.2 / Windows Terminal 1.8.1521.0] - Python version: [e.g. `3.7.2` / `3.9.6`] - NVML version (driver version): [e.g. `460.84`] - `nvitop` version or commit: [e.g. `0.10.0` / `0.10.1.dev7+ga083321` / `main@75ae3c`] - `python-ml-py` version: [e.g. `11.450.51`] - Locale: [e.g. `C` / `C.UTF-8` / `en_US.UTF-8`] #### Description #### Motivation and Context #### Testing #### Images / Videos nvitop-1.4.2/.github/dependabot.yml000066400000000000000000000003621474547113600172420ustar00rootroot00000000000000version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: interval: "weekly" day: "monday" time: "12:00" timezone: "Asia/Shanghai" commit-message: prefix: "deps(workflows)" nvitop-1.4.2/.github/workflows/000077500000000000000000000000001474547113600164465ustar00rootroot00000000000000nvitop-1.4.2/.github/workflows/build.yaml000066400000000000000000000140151474547113600204320ustar00rootroot00000000000000name: Build on: push: branches: - main # allow to trigger the workflow with tag push event pull_request: paths: - setup.py - setup.cfg - pyproject.toml - MANIFEST.in - nvitop/version.py - Dockerfile - .github/workflows/build.yaml release: types: - published # Allow to trigger the workflow manually workflow_dispatch: inputs: task: description: "Task type" type: choice options: - build-only - build-and-publish required: true concurrency: group: "${{ github.workflow }}-${{ github.ref }}" cancel-in-progress: ${{ github.event_name == 'pull_request' }} permissions: contents: read jobs: build: runs-on: ubuntu-latest if: github.repository == 'XuehaiPan/nvitop' 
timeout-minutes: 30 steps: - name: Checkout uses: actions/checkout@v4 with: submodules: "recursive" fetch-depth: 0 - name: Set up Python id: py uses: actions/setup-python@v5 with: python-version: "3.7 - 3.13" update-environment: true - name: Upgrade build dependencies run: python -m pip install --upgrade pip setuptools wheel build - name: Quick test run: | python -m venv venv && ( source venv/bin/activate && python -m pip install --upgrade pip setuptools pre-commit pylint[spelling] mypy typing-extensions && python -m pip install -r requirements.txt && python -m pip install -r nvitop-exporter/requirements.txt && python -m pre_commit install --install-hooks && python -m pre_commit run --all-files && python -c 'import nvitop' && python -m nvitop --version && python -m nvitop --help && python -m nvitop.select --version && python -m nvitop.select --help && ( cd nvitop-exporter && python -c 'import nvitop_exporter' && python -m nvitop_exporter --version && python -m nvitop_exporter --help ) ) - name: Test docker build run: | docker build --tag nvitop:latest . docker run --rm nvitop:latest --help - name: Set __release__ if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' run: | sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop/version.py sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop-exporter/nvitop_exporter/version.py - name: Print version run: | python setup.py --version python nvitop-exporter/setup.py --version - name: Build sdist and wheels run: | python -m build --outdir dist . 
python -m build --outdir dist nvitop-exporter - name: List built sdist and wheels run: ls -lh dist/ - name: Upload artifact uses: actions/upload-artifact@v4 with: name: artifact path: dist/* if-no-files-found: error publish: runs-on: ubuntu-latest needs: [build] if: | github.repository == 'XuehaiPan/nvitop' && github.event_name != 'pull_request' && (github.event_name != 'workflow_dispatch' || github.event.inputs.task == 'build-and-publish') && (github.event_name != 'push' || startsWith(github.ref, 'refs/tags/')) timeout-minutes: 15 steps: - name: Checkout uses: actions/checkout@v4 with: submodules: "recursive" fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v5 if: startsWith(github.ref, 'refs/tags/') with: python-version: "3.7 - 3.13" update-environment: true - name: Upgrade pip run: | python -m pip install --upgrade pip setuptools - name: Set __release__ if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' run: | sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop/version.py sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop-exporter/nvitop_exporter/version.py - name: Print version run: | python setup.py --version python nvitop-exporter/setup.py --version - name: Check consistency between the package version and release tag if: startsWith(github.ref, 'refs/tags/') run: | RELEASE_TAG="${GITHUB_REF#refs/*/}" PACKAGE_VER="v$(python setup.py --version)" if [[ "${PACKAGE_VER}" != "${RELEASE_TAG}" ]]; then echo "package ver. (${PACKAGE_VER}) != release tag. (${RELEASE_TAG})" exit 1 fi PACKAGE_VER="v$(python nvitop-exporter/setup.py --version)" if [[ "${PACKAGE_VER}" != "${RELEASE_TAG}" ]]; then echo "package ver. (${PACKAGE_VER}) != release tag. 
(${RELEASE_TAG})" exit 1 fi - name: Download built sdist and wheels uses: actions/download-artifact@v4 with: # unpacks default artifact into dist/ # if `name: artifact` is omitted, the action will create extra parent dir name: artifact path: dist - name: Publish to TestPyPI if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' uses: pypa/gh-action-pypi-publish@release/v1 with: user: __token__ password: ${{ secrets.TESTPYPI_UPLOAD_TOKEN }} repository-url: https://test.pypi.org/legacy/ verbose: true print-hash: true skip-existing: true - name: Publish to PyPI if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' uses: pypa/gh-action-pypi-publish@release/v1 with: user: __token__ password: ${{ secrets.PYPI_UPLOAD_TOKEN }} verbose: true print-hash: true skip-existing: true nvitop-1.4.2/.github/workflows/lint.yaml000066400000000000000000000033121474547113600202770ustar00rootroot00000000000000name: Lint on: push: branches: - main pull_request: concurrency: group: "${{ github.workflow }}-${{ github.ref }}" cancel-in-progress: ${{ github.event_name == 'pull_request' }} permissions: contents: read jobs: lint: runs-on: ubuntu-latest timeout-minutes: 30 steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 1 - name: Set up Python id: py uses: actions/setup-python@v5 with: python-version: "3.7 - 3.13" update-environment: true - name: Upgrade pip run: | python -m pip install --upgrade pip setuptools - name: Install dependencies run: | python -m pip install -r requirements.txt - name: Import tests run: | python -c 'import nvitop' python -m nvitop --version python -m nvitop --help python -m nvitop.select --version python -m nvitop.select --help - name: Install dependencies for nvitop-exporter run: | python -m pip install -r nvitop-exporter/requirements.txt - name: Import tests for nvitop-exporter run: | ( cd nvitop-exporter && python -c 'import nvitop_exporter' && python -m nvitop_exporter --version && 
python -m nvitop_exporter --help ) - name: Install linters run: | python -m pip install --upgrade pre-commit pylint[spelling] mypy typing-extensions - name: pre-commit run: | python -m pre_commit --version python -m pre_commit install --install-hooks python -m pre_commit run --all-files nvitop-1.4.2/.gitignore000066400000000000000000000034671474547113600150530ustar00rootroot00000000000000# NVML library pynvml.py # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ /lib/ /lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ docs/source/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ nvitop-1.4.2/.pre-commit-config.yaml000066400000000000000000000041671474547113600173420ustar00rootroot00000000000000# See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks ci: skip: [pylint] autofix_prs: true autofix_commit_msg: "fix: [pre-commit.ci] auto fixes [...]" autoupdate_commit_msg: "chore(pre-commit): [pre-commit.ci] autoupdate" default_stages: [pre-commit, pre-push, manual] repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: - id: check-symlinks - id: destroyed-symlinks - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - id: check-toml - id: check-ast - id: check-added-large-files - id: check-merge-conflict - id: check-executables-have-shebangs - id: check-shebang-scripts-are-executable - id: detect-private-key - id: debug-statements - id: double-quote-string-fixer - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.3 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] - repo: https://github.com/psf/black rev: 24.10.0 hooks: - id: black - repo: https://github.com/asottile/pyupgrade rev: v3.19.1 hooks: - id: pyupgrade args: [--py37-plus] # sync with requires-python - repo: https://github.com/pycqa/flake8 rev: 7.1.1 hooks: - id: flake8 additional_dependencies: - flake8-bugbear - flake8-comprehensions - flake8-docstrings - flake8-pyi - flake8-simplify exclude: | (?x)( ^docs/source/conf.py$ ) - repo: https://github.com/codespell-project/codespell rev: v2.4.0 hooks: - id: codespell additional_dependencies: [".[toml]"] - repo: local hooks: - id: pylint name: pylint entry: pylint 
language: system types: [python] require_serial: true - repo: local hooks: - id: mypy name: mypy entry: mypy language: system types_or: [python, pyi] require_serial: true exclude: | (?x)( ^nvitop-exporter/setup.py$ ) nvitop-1.4.2/.readthedocs.yaml000066400000000000000000000012261474547113600163010ustar00rootroot00000000000000# .readthedocs.yaml # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Set the version of Python and other tools you might need build: os: ubuntu-lts-latest tools: python: "3.8" jobs: post_install: - bash docs/source/fix-psutil-docstring.sh # Build documentation in the docs/ directory with Sphinx sphinx: builder: html configuration: docs/source/conf.py fail_on_warning: true # Optionally declare the Python requirements required to build your docs python: install: - requirements: requirements.txt - requirements: docs/requirements.txt nvitop-1.4.2/CHANGELOG.md000066400000000000000000000144031474547113600146640ustar00rootroot00000000000000# Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ------ ## [Unreleased] ### Added - ### Changed - ### Fixed - ### Removed - ------ ## [1.4.2] - 2025-01-27 ### Removed - Vendor third-party dependency `termcolor` by [@XuehaiPan](https://github.com/XuehaiPan) in [#148](https://github.com/XuehaiPan/nvitop/pull/148). - Remove third-party dependency `cachetools` by [@XuehaiPan](https://github.com/XuehaiPan) in [#147](https://github.com/XuehaiPan/nvitop/pull/147). ------ ## [1.4.1] - 2025-01-13 ### Fixed - Fix passing invalid device handle (e.g., GPU is lost) to NVML functions by [@XuehaiPan](https://github.com/XuehaiPan) in [#146](https://github.com/XuehaiPan/nvitop/pull/146). 
- Fix CUDA device selection tool `nvisel` by [@XuehaiPan](https://github.com/XuehaiPan). ------ ## [1.4.0] - 2024-12-29 ### Added - Add Grafana dashboard for `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#138](https://github.com/XuehaiPan/nvitop/pull/138). - Handle exceptions for function `getpass.getuser()` by [@XuehaiPan](https://github.com/XuehaiPan) in [#130](https://github.com/XuehaiPan/nvitop/pull/130). Issued by [@landgraf](https://github.com/landgraf). ### Changed - Refactor setup scripts by [@XuehaiPan](https://github.com/XuehaiPan). ### Fixed - Fix documentation for the `ResourceMetricCollector.clear()` method by [@MyGodItsFull0fStars](https://github.com/MyGodItsFull0fStars) in [#132](https://github.com/XuehaiPan/nvitop/pull/132). - Gracefully ignore UTF-8 decoding errors by [@XuehaiPan](https://github.com/XuehaiPan). ------ ## [1.3.2] - 2023-10-17 ### Added - Add separate implementation for `GpuStatsLogger` callback for `lightning` by [@XuehaiPan](https://github.com/XuehaiPan) in [#114](https://github.com/XuehaiPan/nvitop/pull/114). - Remove metrics if process is gone in `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#107](https://github.com/XuehaiPan/nvitop/pull/107). ------ ## [1.3.1] - 2023-10-05 ### Added - Add Python 3.12 classifiers by [@XuehaiPan](https://github.com/XuehaiPan) in [#101](https://github.com/XuehaiPan/nvitop/pull/101). ### Fixed - Fix `libcuda.cuDeviceGetUuid()` when the UUID contains `0x00` by [@XuehaiPan](https://github.com/XuehaiPan) in [#100](https://github.com/XuehaiPan/nvitop/pull/100). ------ ## [1.3.0] - 2023-08-27 ### Added - Add Prometheus exporter by [@XuehaiPan](https://github.com/XuehaiPan) in [#92](https://github.com/XuehaiPan/nvitop/pull/92). - Add device APIs to query PCIe and NVLink throughput by [@XuehaiPan](https://github.com/XuehaiPan) in [#87](https://github.com/XuehaiPan/nvitop/pull/87). 
### Changed - Use recent timestamp for GPU process utilization query for more accurate per-process GPU usage by [@XuehaiPan](https://github.com/XuehaiPan) in [#85](https://github.com/XuehaiPan/nvitop/pull/85). We extend our heartfelt gratitude to [@2581543189](https://github.com/2581543189) for their invaluable assistance. Their timely comments and comprehensive feedback have greatly contributed to the improvement of this project. ### Fixed - Fix upstream changes for process info v3 APIs on 535.104.05 driver by [@XuehaiPan](https://github.com/XuehaiPan) in [#94](https://github.com/XuehaiPan/nvitop/pull/94). - Fix removal for process info v3 APIs on the upstream 535.98 driver by [@XuehaiPan](https://github.com/XuehaiPan) in [#89](https://github.com/XuehaiPan/nvitop/pull/89). ------ ## [1.2.0] - 2023-07-24 ### Added - Include last snapshot metrics in the log results for `ResourceMetricCollector` by [@XuehaiPan](https://github.com/XuehaiPan) in [#80](https://github.com/XuehaiPan/nvitop/pull/80). - Add `mypy` integration and update type annotations by [@XuehaiPan](https://github.com/XuehaiPan) in [#73](https://github.com/XuehaiPan/nvitop/pull/73). ### Fixed - Fix process info support for NVIDIA R535 driver (CUDA 12.2+) by [@XuehaiPan](https://github.com/XuehaiPan) in [#79](https://github.com/XuehaiPan/nvitop/pull/79). - Fix inappropriate exception catching in function `libcuda.cuDeviceGetUuid` by [@XuehaiPan](https://github.com/XuehaiPan). ------ ## [1.1.2] - 2023-04-11 ### Fixed - Further isolate the `CUDA_VISIBLE_DEVICES` parser in a subprocess by [@XuehaiPan](https://github.com/XuehaiPan) in [#70](https://github.com/XuehaiPan/nvitop/pull/70). ------ ## [1.1.1] - 2023-04-07 ### Fixed - Fix MIG device support by [@XuehaiPan](https://github.com/XuehaiPan). ------ ## [1.1.0] - 2023-04-07 ### Added - Support float number as snapshot interval that >= 0.25s by [@XuehaiPan](https://github.com/XuehaiPan) in [#67](https://github.com/XuehaiPan/nvitop/pull/67). 
- Show more host metrics (e.g., used virtual memory, uptime) in CLI by [@XuehaiPan](https://github.com/XuehaiPan) in [#59](https://github.com/XuehaiPan/nvitop/pull/59). ### Changed - Move `TTLCache` usage to CLI-only by [@XuehaiPan](https://github.com/XuehaiPan) in [#66](https://github.com/XuehaiPan/nvitop/pull/66). ### Fixed - Respect `FORCE_COLOR` and `NO_COLOR` environment variables by [@XuehaiPan](https://github.com/XuehaiPan). ### Removed - Drop Python 3.6 support by [@XuehaiPan](https://github.com/XuehaiPan) in [#56](https://github.com/XuehaiPan/nvitop/pull/56). ------ ## [1.0.0] - 2023-02-01 ### Added - The first stable release of `nvitop` by [@XuehaiPan](https://github.com/XuehaiPan). ------ [Unreleased]: https://github.com/XuehaiPan/nvitop/compare/v1.4.2...HEAD [1.4.2]: https://github.com/XuehaiPan/nvitop/releases/tag/v1.4.2 [1.4.1]: https://github.com/XuehaiPan/nvitop/releases/tag/v1.4.1 [1.4.0]: https://github.com/XuehaiPan/nvitop/releases/tag/v1.4.0 [1.3.2]: https://github.com/XuehaiPan/nvitop/releases/tag/v1.3.2 [1.3.1]: https://github.com/XuehaiPan/nvitop/releases/tag/v1.3.1 [1.3.0]: https://github.com/XuehaiPan/nvitop/releases/tag/v1.3.0 [1.2.0]: https://github.com/XuehaiPan/nvitop/releases/tag/v1.2.0 [1.1.2]: https://github.com/XuehaiPan/nvitop/releases/tag/v1.1.2 [1.1.1]: https://github.com/XuehaiPan/nvitop/releases/tag/v1.1.1 [1.1.0]: https://github.com/XuehaiPan/nvitop/releases/tag/v1.1.0 [1.0.0]: https://github.com/XuehaiPan/nvitop/releases/tag/v1.0.0 nvitop-1.4.2/CODE_OF_CONDUCT.md000066400000000000000000000125541474547113600156570ustar00rootroot00000000000000# Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socioeconomic status, nationality, personal 
appearance, race, caste, color, religion, or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. ## Our Standards Examples of behavior that contributes to a positive environment for our community include: * Demonstrating empathy and kindness toward other people * Being respectful of differing opinions, viewpoints, and experiences * Giving and gracefully accepting constructive feedback * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience * Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: * The use of sexualized language or imagery, and sexual attention or advances of any kind * Trolling, insulting or derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or email address, without their explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. 
Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at XuehaiPan@pku.edu.cn. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. 
No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.1, available at [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. For answers to common questions about this code of conduct, see the FAQ at [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at [https://www.contributor-covenant.org/translations][translations]. [homepage]: https://www.contributor-covenant.org [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html [Mozilla CoC]: https://github.com/mozilla/diversity [FAQ]: https://www.contributor-covenant.org/faq [translations]: https://www.contributor-covenant.org/translations nvitop-1.4.2/COPYING000066400000000000000000001045151474547113600141120ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. 
By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. 
Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. 
To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. 
A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. 
You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. 
You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. 
You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. 
Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. 
If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. 
When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. 
If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. 
If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. 
For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. 
If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. 
You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. 
The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. 
SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . nvitop-1.4.2/Dockerfile000066400000000000000000000024241474547113600150450ustar00rootroot00000000000000ARG basetag="450-signed-ubuntu22.04" # Ubuntu only FROM nvcr.io/nvidia/driver:"${basetag}" ENV NVIDIA_DISABLE_REQUIRE=true ENV DEBIAN_FRONTEND=noninteractive # Update APT sources RUN . 
/etc/os-release && [ "${NAME}" = "Ubuntu" ] && \ echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu ${UBUNTU_CODENAME} main universe" > /etc/apt/sources.list && \ echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu ${UBUNTU_CODENAME}-updates main universe" >> /etc/apt/sources.list && \ echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu ${UBUNTU_CODENAME}-security main universe" >> /etc/apt/sources.list # Install Python 3 RUN apt-get update && \ apt-get install --quiet --yes --no-install-recommends python3-dev python3-venv locales && \ rm -rf /var/lib/apt/lists/* # Setup locale ENV LC_ALL=C.UTF-8 RUN update-locale LC_ALL="C.UTF-8" # Setup environment RUN python3 -m venv /venv && \ . /venv/bin/activate && \ python3 -m pip install --upgrade pip setuptools && \ rm -rf /root/.cache && \ echo && echo && echo "source /venv/bin/activate" >> /root/.bashrc ENV SHELL /bin/bash # Install nvitop COPY . /nvitop WORKDIR /nvitop RUN . /venv/bin/activate && \ python3 -m pip install . && \ rm -rf /root/.cache # Entrypoint ENTRYPOINT [ "/venv/bin/python3", "-m", "nvitop" ] nvitop-1.4.2/LICENSE000066400000000000000000000261501474547113600140620ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2021-2025 Xuehai Pan. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. nvitop-1.4.2/MANIFEST.in000066400000000000000000000001421474547113600146040ustar00rootroot00000000000000recursive-include nvitop LICENSE recursive-include nvitop COPYING include LICENSE include COPYING nvitop-1.4.2/README.md000066400000000000000000002234121474547113600143340ustar00rootroot00000000000000# nvitop ![Python 3.7+](https://img.shields.io/badge/Python-3.7%2B-brightgreen) [![PyPI](https://img.shields.io/pypi/v/nvitop?label=pypi&logo=pypi)](https://pypi.org/project/nvitop) [![conda-forge](https://img.shields.io/conda/vn/conda-forge/nvitop?label=conda&logo=condaforge)](https://anaconda.org/conda-forge/nvitop) [![Documentation Status](https://img.shields.io/readthedocs/nvitop?label=docs&logo=readthedocs)](https://nvitop.readthedocs.io) [![Downloads](https://static.pepy.tech/personalized-badge/nvitop?period=total&left_color=grey&right_color=blue&left_text=downloads)](https://pepy.tech/project/nvitop) [![GitHub Repo Stars](https://img.shields.io/github/stars/XuehaiPan/nvitop?label=stars&logo=github&color=brightgreen)](https://github.com/XuehaiPan/nvitop/stargazers) 
[![License](https://img.shields.io/github/license/XuehaiPan/nvitop?label=license&logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAyNCAyNCIgd2lkdGg9IjI0IiBoZWlnaHQ9IjI0IiBmaWxsPSIjZmZmZmZmIj48cGF0aCBmaWxsLXJ1bGU9ImV2ZW5vZGQiIGQ9Ik0xMi43NSAyLjc1YS43NS43NSAwIDAwLTEuNSAwVjQuNUg5LjI3NmExLjc1IDEuNzUgMCAwMC0uOTg1LjMwM0w2LjU5NiA1Ljk1N0EuMjUuMjUgMCAwMTYuNDU1IDZIMi4zNTNhLjc1Ljc1IDAgMTAwIDEuNUgzLjkzTC41NjMgMTUuMThhLjc2Mi43NjIgMCAwMC4yMS44OGMuMDguMDY0LjE2MS4xMjUuMzA5LjIyMS4xODYuMTIxLjQ1Mi4yNzguNzkyLjQzMy42OC4zMTEgMS42NjIuNjIgMi44NzYuNjJhNi45MTkgNi45MTkgMCAwMDIuODc2LS42MmMuMzQtLjE1NS42MDYtLjMxMi43OTItLjQzMy4xNS0uMDk3LjIzLS4xNTguMzEtLjIyM2EuNzUuNzUgMCAwMC4yMDktLjg3OEw1LjU2OSA3LjVoLjg4NmMuMzUxIDAgLjY5NC0uMTA2Ljk4NC0uMzAzbDEuNjk2LTEuMTU0QS4yNS4yNSAwIDAxOS4yNzUgNmgxLjk3NXYxNC41SDYuNzYzYS43NS43NSAwIDAwMCAxLjVoMTAuNDc0YS43NS43NSAwIDAwMC0xLjVIMTIuNzVWNmgxLjk3NGMuMDUgMCAuMS4wMTUuMTQuMDQzbDEuNjk3IDEuMTU0Yy4yOS4xOTcuNjMzLjMwMy45ODQuMzAzaC44ODZsLTMuMzY4IDcuNjhhLjc1Ljc1IDAgMDAuMjMuODk2Yy4wMTIuMDA5IDAgMCAuMDAyIDBhMy4xNTQgMy4xNTQgMCAwMC4zMS4yMDZjLjE4NS4xMTIuNDUuMjU2Ljc5LjRhNy4zNDMgNy4zNDMgMCAwMDIuODU1LjU2OCA3LjM0MyA3LjM0MyAwIDAwMi44NTYtLjU2OWMuMzM4LS4xNDMuNjA0LS4yODcuNzktLjM5OWEzLjUgMy41IDAgMDAuMzEtLjIwNi43NS43NSAwIDAwLjIzLS44OTZMMjAuMDcgNy41aDEuNTc4YS43NS43NSAwIDAwMC0xLjVoLTQuMTAyYS4yNS4yNSAwIDAxLS4xNC0uMDQzbC0xLjY5Ny0xLjE1NGExLjc1IDEuNzUgMCAwMC0uOTg0LS4zMDNIMTIuNzVWMi43NXpNMi4xOTMgMTUuMTk4YTUuNDE4IDUuNDE4IDAgMDAyLjU1Ny42MzUgNS40MTggNS40MTggMCAwMDIuNTU3LS42MzVMNC43NSA5LjM2OGwtMi41NTcgNS44M3ptMTQuNTEtLjAyNGMuMDgyLjA0LjE3NC4wODMuMjc1LjEyNi41My4yMjMgMS4zMDUuNDUgMi4yNzIuNDVhNS44NDYgNS44NDYgMCAwMDIuNTQ3LS41NzZMMTkuMjUgOS4zNjdsLTIuNTQ3IDUuODA3eiI+PC9wYXRoPjwvc3ZnPgo=)](#license) An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for GPU process management. The full API references host at .

Monitor
Monitor mode of nvitop.
(TERM: GNOME Terminal / OS: Ubuntu 16.04 LTS (over SSH) / Locale: en_US.UTF-8)

Grafana Dashboard
A Grafana dashboard built on top of nvitop-exporter.

### Table of Contents - [Features](#features) - [Requirements](#requirements) - [Installation](#installation) - [Usage](#usage) - [Device and Process Status](#device-and-process-status) - [Resource Monitor](#resource-monitor) - [For Docker Users](#for-docker-users) - [For SSH Users](#for-ssh-users) - [Command Line Options and Environment Variables](#command-line-options-and-environment-variables) - [Keybindings for Monitor Mode](#keybindings-for-monitor-mode) - [CUDA Visible Devices Selection Tool](#cuda-visible-devices-selection-tool) - [Callback Functions for Machine Learning Frameworks](#callback-functions-for-machine-learning-frameworks) - [Callback for TensorFlow (Keras)](#callback-for-tensorflow-keras) - [Callback for PyTorch Lightning](#callback-for-pytorch-lightning) - [TensorBoard Integration](#tensorboard-integration) - [More than a Monitor](#more-than-a-monitor) - [Quick Start](#quick-start) - [Status Snapshot](#status-snapshot) - [Resource Metric Collector](#resource-metric-collector) - [Low-level APIs](#low-level-apis) - [Device](#device) - [Process](#process) - [Host (inherited from psutil)](#host-inherited-from-psutil) - [Screenshots](#screenshots) - [Changelog](#changelog) - [License](#license) - [Copyright Notice](#copyright-notice) ------ `nvitop` is an interactive NVIDIA device and process monitoring tool. It has a colorful and informative interface that continuously updates the status of the devices and processes. As a resource monitor, it includes many features and options, such as tree-view, environment variable viewing, process filtering, process metrics monitoring, etc. Beyond that, the package also ships a [CUDA device selection tool `nvisel`](#cuda-visible-devices-selection-tool) for deep learning researchers. It also provides handy APIs that allow developers to write their own monitoring tools. Please refer to section [More than a Monitor](#more-than-a-monitor) and the full API references at for more information.

Filter
Process filtering and a more colorful interface.

Comparison
Compare to nvidia-smi.

------ ## Features - **Informative and fancy output**: show more information than `nvidia-smi` with colorized fancy box drawing. - **Monitor mode**: can run as a resource monitor, rather than print the results only once. - bar charts and history graphs - process sorting - process filtering - send signals to processes with a keystroke - tree-view screen for GPU processes and their parent processes - environment variable screen - help screen - mouse support - **Interactive**: responsive for user input (from keyboard and/or mouse) in monitor mode. (vs. [gpustat](https://github.com/wookayin/gpustat) & [py3nvml](https://github.com/fbcotter/py3nvml)) - **Efficient**: - query device status using [*NVML Python bindings*](https://pypi.org/project/nvidia-ml-py) directly, instead of parsing the output of `nvidia-smi`. (vs. [nvidia-htop](https://github.com/peci1/nvidia-htop)) - support sparse query and cache results with `TTLCache` from [cachetools](https://github.com/tkem/cachetools). (vs. [gpustat](https://github.com/wookayin/gpustat)) - display information using the `curses` library rather than `print` with ANSI escape codes. (vs. [py3nvml](https://github.com/fbcotter/py3nvml)) - asynchronously gather information using multi-threading and correspond to user input much faster. (vs. [nvtop](https://github.com/Syllo/nvtop)) - **Portable**: work on both Linux and Windows. - get host process information using the cross-platform library [psutil](https://github.com/giampaolo/psutil) instead of calling `ps -p ` in a subprocess. (vs. [nvidia-htop](https://github.com/peci1/nvidia-htop) & [py3nvml](https://github.com/fbcotter/py3nvml)) - written in pure Python, easy to install with `pip`. (vs. [nvtop](https://github.com/Syllo/nvtop)) - **Integrable**: easy to integrate into other applications, more than monitoring. (vs. [nvidia-htop](https://github.com/peci1/nvidia-htop) & [nvtop](https://github.com/Syllo/nvtop))

Windows
nvitop supports Windows!
(SHELL: PowerShell / TERM: Windows Terminal / OS: Windows 10 / Locale: en-US)

------ ## Requirements - Python 3.7+ - NVIDIA Management Library (NVML) - nvidia-ml-py - psutil - curses[*](#curses) (with `libncursesw`) **NOTE:** The [NVIDIA Management Library (*NVML*)](https://developer.nvidia.com/nvidia-management-library-nvml) is a C-based programmatic interface for monitoring and managing various states. The runtime version of the NVML library ships with the NVIDIA display driver (available at [Download Drivers | NVIDIA](https://www.nvidia.com/Download/index.aspx)), or can be downloaded as part of the NVIDIA CUDA Toolkit (available at [CUDA Toolkit | NVIDIA Developer](https://developer.nvidia.com/cuda-downloads)). The lists of OS platforms and NVIDIA-GPUs supported by the NVML library can be found in the [NVML API Reference](https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html). This repository contains a Bash script to install/upgrade the NVIDIA drivers for Ubuntu Linux. For example: ```bash git clone --depth=1 https://github.com/XuehaiPan/nvitop.git && cd nvitop # Change to tty3 console (required for desktop users with GUI (tty2)) # Optional for SSH users sudo chvt 3 # or use keyboard shortcut: Ctrl-LeftAlt-F3 bash install-nvidia-driver.sh --package=nvidia-driver-470 # install the R470 driver from ppa:graphics-drivers bash install-nvidia-driver.sh --latest # install the latest driver from ppa:graphics-drivers ```

install-nvidia-driver
NVIDIA driver installer for Ubuntu Linux.

Run `bash install-nvidia-driver.sh --help` for more information. * The `curses` library is a built-in module of Python on Unix-like systems, and it is supported by a third-party package called `windows-curses` on Windows using PDCurses. Inconsistent behavior of `nvitop` may occur on different terminal emulators on Windows, such as missing mouse support. ------ ## Installation **It is highly recommended to install `nvitop` in an isolated virtual environment.** Simple installation and run via [`pipx`](https://pypa.github.io/pipx) or [`uvx`](https://docs.astral.sh/uv/guides/tools) (a.k.a. `uv tool run`): ```bash pipx run nvitop # or uvx nvitop ``` You can also set this command as an alias in your shell startup file, e.g.: ```bash # For Bash echo 'alias nvitop="pipx run nvitop"' >> ~/.bashrc # For Zsh echo 'alias nvitop="pipx run nvitop"' >> ~/.zshrc # For Fish mkdir -p ~/.config/fish echo 'alias nvitop="pipx run nvitop"' >> ~/.config/fish/config.fish # For PowerShell New-Item -Path (Split-Path -Parent -Path $PROFILE.CurrentUserAllHosts) -ItemType Directory -Force 'Function nvitop { pipx run nvitop @Args }' >> $PROFILE.CurrentUserAllHosts ``` or ```bash # For Bash echo 'alias nvitop="uvx nvitop"' >> ~/.bashrc # For Zsh echo 'alias nvitop="uvx nvitop"' >> ~/.zshrc # For Fish mkdir -p ~/.config/fish echo 'alias nvitop="uvx nvitop"' >> ~/.config/fish/config.fish # For PowerShell New-Item -Path (Split-Path -Parent -Path $PROFILE.CurrentUserAllHosts) -ItemType Directory -Force 'Function nvitop { uvx nvitop @Args }' >> $PROFILE.CurrentUserAllHosts ``` Install from PyPI ([![PyPI](https://img.shields.io/pypi/v/nvitop?label=pypi&logo=pypi)](https://pypi.org/project/nvitop)): ```bash pip3 install --upgrade nvitop ``` Install from conda-forge ([![conda-forge](https://img.shields.io/conda/v/conda-forge/nvitop?logo=condaforge)](https://anaconda.org/conda-forge/nvitop)): ```bash conda install -c conda-forge nvitop ``` Install the latest version from GitHub (![Commit 
Count](https://img.shields.io/github/commits-since/XuehaiPan/nvitop/v1.4.2)): ```bash pip3 install --upgrade pip setuptools pip3 install git+https://github.com/XuehaiPan/nvitop.git#egg=nvitop ``` Or, clone this repo and install manually: ```bash git clone --depth=1 https://github.com/XuehaiPan/nvitop.git cd nvitop pip3 install . ``` **NOTE:** If you encounter the *"nvitop: command not found"* error after installation, please check whether you have added the Python console script path (e.g., `"${HOME}/.local/bin"`) to your `PATH` environment variable. Alternatively, you can use `python3 -m nvitop`.

MIG Device Support
MIG Device Support.

------ ## Usage ### Device and Process Status Query the device and process status. The output is similar to `nvidia-smi`, but has been enriched and colorized. ```bash # Query the status of all devices $ nvitop -1 # or use `python3 -m nvitop -1` # Specify query devices (by integer indices) $ nvitop -1 -o 0 1 # only show and # Only show devices in `CUDA_VISIBLE_DEVICES` (by integer indices or UUID strings) $ nvitop -1 -ov # Only show GPU processes with the compute context (type: 'C' or 'C+G') $ nvitop -1 -c ``` When the `-1` switch is on, the result will be displayed **ONLY ONCE** (same as the default behavior of `nvidia-smi`). This is much faster and has lower resource usage. See [Command Line Options](#command-line-options-and-environment-variables) for more command options. There is also a CLI tool called `nvisel` that ships with the `nvitop` PyPI package. See [CUDA Visible Devices Selection Tool](#cuda-visible-devices-selection-tool) for more information. ### Resource Monitor Run as a resource monitor: ```bash # Monitor mode (when the display mode is omitted, `NVITOP_MONITOR_MODE` will be used) $ nvitop # or use `python3 -m nvitop` # Automatically configure the display mode according to the terminal size $ nvitop -m auto # shortcut: `a` key # Arbitrarily display as `full` mode $ nvitop -m full # shortcut: `f` key # Arbitrarily display as `compact` mode $ nvitop -m compact # shortcut: `c` key # Specify query devices (by integer indices) $ nvitop -o 0 1 # only show and # Only show devices in `CUDA_VISIBLE_DEVICES` (by integer indices or UUID strings) $ nvitop -ov # Only show GPU processes with the compute context (type: 'C' or 'C+G') $ nvitop -c # Use ASCII characters only $ nvitop -U # useful for terminals without Unicode support # For light terminals $ nvitop --light # For spectrum-like bar charts (requires the terminal supports 256-color) $ nvitop --colorful ``` You can configure the default monitor mode with the `NVITOP_MONITOR_MODE` environment variable 
(default `auto` if not set). See [Command Line Options and Environment Variables](#command-line-options-and-environment-variables) for more command options. In monitor mode, you can use Ctrl-c / T / K keys to interrupt / terminate / kill a process. And it's recommended to *terminate* or *kill* a process in the **tree-view screen** (shortcut: t). For normal users, `nvitop` will shallow other users' processes (in low-intensity colors). For **system administrators**, you can use `sudo nvitop` to terminate other users' processes. Also, to enter the process metrics screen, select a process and then press the Enter / Return key . `nvitop` dynamically displays the process metrics with live graphs.

Process Metrics Screen
Watch metrics for a specific process (shortcut: Enter / Return).

Press h for help or q to return to the terminal. See [Keybindings for Monitor Mode](#keybindings-for-monitor-mode) for more shortcuts.

Help Screen
nvitop comes with a help screen (shortcut: h).

#### For Docker Users Build and run the Docker image using [nvidia-docker](https://github.com/NVIDIA/nvidia-docker): ```bash git clone --depth=1 https://github.com/XuehaiPan/nvitop.git && cd nvitop # clone this repo first docker build --tag nvitop:latest . # build the Docker image docker run -it --rm --runtime=nvidia --gpus=all --pid=host nvitop:latest # run the Docker container ``` The [`Dockerfile`](Dockerfile) has an optional build argument `basetag` (default: `450-signed-ubuntu22.04`) for the tag of image [`nvcr.io/nvidia/driver`](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/driver/tags). **NOTE:** Don't forget to add the `--pid=host` option when running the container. #### For SSH Users Run `nvitop` directly on the SSH session instead of a login shell: ```bash ssh user@host -t nvitop # installed by `sudo pip3 install ...` ssh user@host -t '~/.local/bin/nvitop' # installed by `pip3 install --user ...` ``` **NOTE:** Users need to add the `-t` option to allocate a pseudo-terminal over the SSH session for monitor mode. #### Command Line Options and Environment Variables Type `nvitop --help` for more command options: ```text usage: nvitop [--help] [--version] [--once | --monitor [{auto,full,compact}]] [--interval SEC] [--ascii] [--colorful] [--force-color] [--light] [--gpu-util-thresh th1 th2] [--mem-util-thresh th1 th2] [--only INDEX [INDEX ...]] [--only-visible] [--compute] [--only-compute] [--graphics] [--only-graphics] [--user [USERNAME ...]] [--pid PID [PID ...]] An interactive NVIDIA-GPU process viewer. options: --help, -h Show this help message and exit. --version, -V Show nvitop's version number and exit. --once, -1 Report query data only once. --monitor [{auto,full,compact}], -m [{auto,full,compact}] Run as a resource monitor. Continuously report query data and handle user inputs. If the argument is omitted, the value from `NVITOP_MONITOR_MODE` will be used. (default fallback mode: auto) --interval SEC Process status update interval in seconds. 
(default: 2) --ascii, --no-unicode, -U Use ASCII characters only, which is useful for terminals without Unicode support. coloring: --colorful Use gradient colors to get spectrum-like bar charts. This option is only available when the terminal supports 256 colors. You may need to set environment variable `TERM="xterm-256color"`. Note that the terminal multiplexer, such as `tmux`, may override the `TREM` variable. --force-color Force colorize even when `stdout` is not a TTY terminal. --light Tweak visual results for light theme terminals in monitor mode. Set variable `NVITOP_MONITOR_MODE="light"` on light terminals for convenience. --gpu-util-thresh th1 th2 Thresholds of GPU utilization to determine the load intensity. Coloring rules: light < th1 % <= moderate < th2 % <= heavy. ( 1 <= th1 < th2 <= 99, defaults: 10 75 ) --mem-util-thresh th1 th2 Thresholds of GPU memory percent to determine the load intensity. Coloring rules: light < th1 % <= moderate < th2 % <= heavy. ( 1 <= th1 < th2 <= 99, defaults: 10 80 ) device filtering: --only INDEX [INDEX ...], -o INDEX [INDEX ...] Only show the specified devices, suppress option `--only-visible`. --only-visible, -ov Only show devices in the `CUDA_VISIBLE_DEVICES` environment variable. process filtering: --compute, -c Only show GPU processes with the compute context. (type: 'C' or 'C+G') --only-compute, -C Only show GPU processes exactly with the compute context. (type: 'C' only) --graphics, -g Only show GPU processes with the graphics context. (type: 'G' or 'C+G') --only-graphics, -G Only show GPU processes exactly with the graphics context. (type: 'G' only) --user [USERNAME ...], -u [USERNAME ...] Only show processes of the given users (or `$USER` for no argument). --pid PID [PID ...], -p PID [PID ...] Only show processes of the given PIDs. 
``` `nvitop` can accept the following environment variables for monitor mode: | Name | Description | Valid Values | Default Value | | -------------------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------- | ----------------- | | `NVITOP_MONITOR_MODE` | The default display mode (a comma-separated string) | `auto` / `full` / `compact`
`plain` / `colorful`
`dark` / `light` | `auto,plain,dark` | | `NVITOP_GPU_UTILIZATION_THRESHOLDS` | Thresholds of GPU utilization | `10,75` , `1,99`, ... | `10,75` | | `NVITOP_MEMORY_UTILIZATION_THRESHOLDS` | Thresholds of GPU memory percent | `10,80` , `1,99`, ... | `10,80` | | `LOGLEVEL` | Log level for log messages | `DEBUG` , `INFO`, `WARNING`, ... | `WARNING` | For example: ```bash # Replace the following export statements if you are not using Bash / Zsh export NVITOP_MONITOR_MODE="full,light" # Full monitor mode with light terminal tweaks nvitop ``` For convenience, you can add these environment variables to your shell startup file, e.g.: ```bash # For Bash echo 'export NVITOP_MONITOR_MODE="full"' >> ~/.bashrc # For Zsh echo 'export NVITOP_MONITOR_MODE="full"' >> ~/.zshrc # For Fish echo 'set -gx NVITOP_MONITOR_MODE "full"' >> ~/.config/fish/config.fish # For PowerShell '$Env:NVITOP_MONITOR_MODE = "full"' >> $PROFILE.CurrentUserAllHosts ``` #### Keybindings for Monitor Mode | Key | Binding | | -------------------------------------------------------------------------: | :----------------------------------------------------------------------------------- | | `q` | Quit and return to the terminal. | | `h` / `?` | Go to the help screen. | | `a` / `f` / `c` | Change the display mode to *auto* / *full* / *compact*. | | `r` / `` / `` | Force refresh the window. | | | | | `` / ``
`` / ``
`` / ``
`` | Select and highlight a process. | | `` / ``
`` / ``
`` | Scroll the host information of processes. | | `` | Select the first process. | | `` | Select the last process. | | ``
`^` | Scroll left to the beginning of the process entry (i.e. beginning of line). | | ``
`$` | Scroll right to the end of the process entry (i.e. end of line). | | `` / ``
`` / ``
`[` / `]` | scroll entire screen (for large amounts of processes). | | | | | `` | Tag/untag current process. | | `` | Clear process selection. | | ``
`I` | Send `signal.SIGINT` to the selected process (interrupt). | | `T` | Send `signal.SIGTERM` to the selected process (terminate). | | `K` | Send `signal.SIGKILL` to the selected process (kill). | | | | | `e` | Show process environment. | | `t` | Toggle tree-view screen. | | `` | Show process metrics. | | | | | `,` / `.` | Select the sort column. | | `/` | Reverse the sort order. | | `on` (`oN`) | Sort processes in the natural order, i.e., in ascending (descending) order of `GPU`. | | `ou` (`oU`) | Sort processes by `USER` in ascending (descending) order. | | `op` (`oP`) | Sort processes by `PID` in descending (ascending) order. | | `og` (`oG`) | Sort processes by `GPU-MEM` in descending (ascending) order. | | `os` (`oS`) | Sort processes by `%SM` in descending (ascending) order. | | `oc` (`oC`) | Sort processes by `%CPU` in descending (ascending) order. | | `om` (`oM`) | Sort processes by `%MEM` in descending (ascending) order. | | `ot` (`oT`) | Sort processes by `TIME` in descending (ascending) order. | **HINT:** It's recommended to terminate or kill a process in the tree-view screen (shortcut: t). ------ ### CUDA Visible Devices Selection Tool Automatically select `CUDA_VISIBLE_DEVICES` from the given criteria. 
Example usage of the CLI tool: ```console # All devices but sorted $ nvisel # or use `python3 -m nvitop.select` 6,5,4,3,2,1,0,7,8 # A simple example to select 4 devices $ nvisel -n 4 # or use `python3 -m nvitop.select -n 4` 6,5,4,3 # Select available devices that satisfy the given constraints $ nvisel --min-count 2 --max-count 3 --min-free-memory 5GiB --max-gpu-utilization 60 6,5,4 # Set `CUDA_VISIBLE_DEVICES` environment variable using `nvisel` $ export CUDA_DEVICE_ORDER="PCI_BUS_ID" CUDA_VISIBLE_DEVICES="$(nvisel -c 1 -f 10GiB)" CUDA_VISIBLE_DEVICES="6,5,4,3,2,1,0" # Use UUID strings in `CUDA_VISIBLE_DEVICES` environment variable $ export CUDA_VISIBLE_DEVICES="$(nvisel -O uuid -c 2 -f 5000M)" CUDA_VISIBLE_DEVICES="GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794,GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1,GPU-96de99c9-d68f-84c8-424c-7c75e59cc0a0,GPU-2428d171-8684-5b64-830c-435cd972ec4a,GPU-6d2a57c9-7783-44bb-9f53-13f36282830a,GPU-f8e5a624-2c7e-417c-e647-b764d26d4733,GPU-f9ca790e-683e-3d56-00ba-8f654e977e02" # Pipe output to other shell utilities $ nvisel --newline -O uuid -C 6 -f 8GiB GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794 GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1 GPU-96de99c9-d68f-84c8-424c-7c75e59cc0a0 GPU-2428d171-8684-5b64-830c-435cd972ec4a GPU-6d2a57c9-7783-44bb-9f53-13f36282830a GPU-f8e5a624-2c7e-417c-e647-b764d26d4733 $ nvisel -0 -O uuid -c 2 -f 4GiB | xargs -0 -I {} nvidia-smi --id={} --query-gpu=index,memory.free --format=csv CUDA_VISIBLE_DEVICES="GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794,GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1,GPU-96de99c9-d68f-84c8-424c-7c75e59cc0a0,GPU-2428d171-8684-5b64-830c-435cd972ec4a,GPU-6d2a57c9-7783-44bb-9f53-13f36282830a,GPU-f8e5a624-2c7e-417c-e647-b764d26d4733,GPU-f9ca790e-683e-3d56-00ba-8f654e977e02" index, memory.free [MiB] 6, 11018 MiB index, memory.free [MiB] 5, 11018 MiB index, memory.free [MiB] 4, 11018 MiB index, memory.free [MiB] 3, 11018 MiB index, memory.free [MiB] 2, 11018 MiB index, memory.free [MiB] 1, 11018 MiB index, 
memory.free [MiB] 0, 11018 MiB # Normalize the `CUDA_VISIBLE_DEVICES` environment variable (e.g. convert UUIDs to indices or get full UUIDs for an abbreviated form) $ nvisel -i "GPU-18ef14e9,GPU-849d5a8d" -S 5,6 $ nvisel -i "GPU-18ef14e9,GPU-849d5a8d" -S -O uuid --newline GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1 GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794 ``` You can also integrate `nvisel` into your training script like this: ```python # Put this at the top of the Python script import os from nvitop import select_devices os.environ['CUDA_VISIBLE_DEVICES'] = ','.join( select_devices(format='uuid', min_count=4, min_free_memory='8GiB') ) ``` Type `nvisel --help` for more command options: ```text usage: nvisel [--help] [--version] [--inherit [CUDA_VISIBLE_DEVICES]] [--account-as-free [USERNAME ...]] [--min-count N] [--max-count N] [--count N] [--min-free-memory SIZE] [--min-total-memory SIZE] [--max-gpu-utilization RATE] [--max-memory-utilization RATE] [--tolerance TOL] [--format FORMAT] [--sep SEP | --newline | --null] [--no-sort] CUDA visible devices selection tool. options: --help, -h Show this help message and exit. --version, -V Show nvisel's version number and exit. constraints: --inherit [CUDA_VISIBLE_DEVICES], -i [CUDA_VISIBLE_DEVICES] Inherit the given `CUDA_VISIBLE_DEVICES`. If the argument is omitted, use the value from the environment. This means selecting a subset of the currently CUDA-visible devices. --account-as-free [USERNAME ...] Account the used GPU memory of the given users as free memory. If this option is specified but without argument, `$USER` will be used. --min-count N, -c N Minimum number of devices to select. (default: 0) The tool will fail (exit non-zero) if the requested resource is not available. --max-count N, -C N Maximum number of devices to select. (default: all devices) --count N, -n N Overriding both `--min-count N` and `--max-count N`. --min-free-memory SIZE, -f SIZE Minimum free memory of devices to select. 
(example value: 4GiB) If this constraint is given, check against all devices. --min-total-memory SIZE, -t SIZE Minimum total memory of devices to select. (example value: 10GiB) If this constraint is given, check against all devices. --max-gpu-utilization RATE, -G RATE Maximum GPU utilization rate of devices to select. (example value: 30) If this constraint is given, check against all devices. --max-memory-utilization RATE, -M RATE Maximum memory bandwidth utilization rate of devices to select. (example value: 50) If this constraint is given, check against all devices. --tolerance TOL, --tol TOL The constraints tolerance (in percentage). (default: 0, i.e., strict) This option can loosen the constraints if the requested resource is not available. For example, set `--tolerance=20` will accept a device with only 4GiB of free memory when set `--min-free-memory=5GiB`. formatting: --format FORMAT, -O FORMAT The output format of the selected device identifiers. (default: index) If any MIG device found, the output format will be fallback to `uuid`. --sep SEP, --separator SEP, -s SEP Separator for the output. (default: ',') --newline Use newline character as separator for the output, equivalent to `--sep=$'\n'`. --null, -0 Use null character ('\x00') as separator for the output. This option corresponds to the `-0` option of `xargs`. --no-sort, -S Do not sort the device by memory usage and GPU utilization. ``` ------ ### Callback Functions for Machine Learning Frameworks `nvitop` provides two builtin callbacks for [TensorFlow (Keras)](https://www.tensorflow.org) and [PyTorch Lightning](https://pytorchlightning.ai). #### Callback for [TensorFlow (Keras)](https://www.tensorflow.org) ```python from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model from tensorflow.python.keras.callbacks import TensorBoard from nvitop.callbacks.keras import GpuStatsLogger gpus = ['/gpu:0', '/gpu:1'] # or `gpus = [0, 1]` or `gpus = 2` model = Xception(weights=None, ..) 
model = multi_gpu_model(model, gpus) # optional model.compile(..) tb_callback = TensorBoard(log_dir='./logs') # or `keras.callbacks.CSVLogger` gpu_stats = GpuStatsLogger(gpus) model.fit(.., callbacks=[gpu_stats, tb_callback]) ``` **NOTE:** Users should assign a `keras.callbacks.TensorBoard` callback or a `keras.callbacks.CSVLogger` callback to the model. And the `GpuStatsLogger` callback should be placed before the `keras.callbacks.TensorBoard` / `keras.callbacks.CSVLogger` callback. #### Callback for [PyTorch Lightning](https://lightning.ai) ```python from lightning.pytorch import Trainer from nvitop.callbacks.lightning import GpuStatsLogger gpu_stats = GpuStatsLogger() trainer = Trainer(gpus=[..], logger=True, callbacks=[gpu_stats]) ``` **NOTE:** Users should assign a logger to the trainer. #### [TensorBoard](https://github.com/tensorflow/tensorboard) Integration Please refer to [Resource Metric Collector](#resource-metric-collector) for an example. ------ ### More than a Monitor `nvitop` can be easily integrated into other applications. You can use `nvitop` to make your own monitoring tools. The full API references are hosted at <https://nvitop.readthedocs.io>. 
#### Quick Start A minimal script to monitor the GPU devices based on APIs from `nvitop`: ```python from nvitop import Device devices = Device.all() # or `Device.cuda.all()` to use CUDA ordinal instead for device in devices: processes = device.processes() # type: Dict[int, GpuProcess] sorted_pids = sorted(processes.keys()) print(device) print(f' - Fan speed: {device.fan_speed()}%') print(f' - Temperature: {device.temperature()}C') print(f' - GPU utilization: {device.gpu_utilization()}%') print(f' - Total memory: {device.memory_total_human()}') print(f' - Used memory: {device.memory_used_human()}') print(f' - Free memory: {device.memory_free_human()}') print(f' - Processes ({len(processes)}): {sorted_pids}') for pid in sorted_pids: print(f' - {processes[pid]}') print('-' * 120) ``` Another more advanced approach with coloring: ```python import time from nvitop import Device, GpuProcess, NA, colored print(colored(time.strftime('%a %b %d %H:%M:%S %Y'), color='red', attrs=('bold',))) devices = Device.cuda.all() # or `Device.all()` to use NVML ordinal instead separator = False for device in devices: processes = device.processes() # type: Dict[int, GpuProcess] print(colored(str(device), color='green', attrs=('bold',))) print(colored(' - Fan speed: ', color='blue', attrs=('bold',)) + f'{device.fan_speed()}%') print(colored(' - Temperature: ', color='blue', attrs=('bold',)) + f'{device.temperature()}C') print(colored(' - GPU utilization: ', color='blue', attrs=('bold',)) + f'{device.gpu_utilization()}%') print(colored(' - Total memory: ', color='blue', attrs=('bold',)) + f'{device.memory_total_human()}') print(colored(' - Used memory: ', color='blue', attrs=('bold',)) + f'{device.memory_used_human()}') print(colored(' - Free memory: ', color='blue', attrs=('bold',)) + f'{device.memory_free_human()}') if len(processes) > 0: processes = GpuProcess.take_snapshots(processes.values(), failsafe=True) processes.sort(key=lambda process: (process.username, process.pid)) 
print(colored(f' - Processes ({len(processes)}):', color='blue', attrs=('bold',))) fmt = ' {pid:<5} {username:<8} {cpu:>5} {host_memory:>8} {time:>8} {gpu_memory:>8} {sm:>3} {command:<}'.format print(colored(fmt(pid='PID', username='USERNAME', cpu='CPU%', host_memory='HOST-MEM', time='TIME', gpu_memory='GPU-MEM', sm='SM%', command='COMMAND'), attrs=('bold',))) for snapshot in processes: print(fmt(pid=snapshot.pid, username=snapshot.username[:7] + ('+' if len(snapshot.username) > 8 else snapshot.username[7:8]), cpu=snapshot.cpu_percent, host_memory=snapshot.host_memory_human, time=snapshot.running_time_human, gpu_memory=(snapshot.gpu_memory_human if snapshot.gpu_memory_human is not NA else 'WDDM:N/A'), sm=snapshot.gpu_sm_utilization, command=snapshot.command)) else: print(colored(' - No Running Processes', attrs=('bold',))) if separator: print('-' * 120) separator = True ```

*Demo — an example monitoring script built with APIs from `nvitop`.*

------ #### Status Snapshot `nvitop` provides a helper function [`take_snapshots`](https://nvitop.readthedocs.io/en/latest/api/collector.html#nvitop.take_snapshots) to retrieve the status of both GPU devices and GPU processes at once. You can type `help(nvitop.take_snapshots)` in Python REPL for detailed documentation. ```python In [1]: from nvitop import take_snapshots, Device ...: import os ...: os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' ...: os.environ['CUDA_VISIBLE_DEVICES'] = '1,0' # comma-separated integers or UUID strings In [2]: take_snapshots() # equivalent to `take_snapshots(Device.all())` Out[2]: SnapshotResult( devices=[ DeviceSnapshot( real=Device(index=0, ...), ... ), ... ], gpu_processes=[ GpuProcessSnapshot( real=GpuProcess(pid=xxxxxx, device=Device(index=0, ...), ...), ... ), ... ] ) In [3]: device_snapshots, gpu_process_snapshots = take_snapshots(Device.all()) # type: Tuple[List[DeviceSnapshot], List[GpuProcessSnapshot]] In [4]: device_snapshots, _ = take_snapshots(gpu_processes=False) # ignore process snapshots In [5]: take_snapshots(Device.cuda.all()) # use CUDA device enumeration Out[5]: SnapshotResult( devices=[ CudaDeviceSnapshot( real=CudaDevice(cuda_index=0, nvml_index=1, ...), ... ), CudaDeviceSnapshot( real=CudaDevice(cuda_index=1, nvml_index=0, ...), ... ), ], gpu_processes=[ GpuProcessSnapshot( real=GpuProcess(pid=xxxxxx, device=CudaDevice(cuda_index=0, ...), ...), ... ), ... ] ) In [6]: take_snapshots(Device.cuda(1)) # only Out[6]: SnapshotResult( devices=[ CudaDeviceSnapshot( real=CudaDevice(cuda_index=1, nvml_index=0, ...), ... ) ], gpu_processes=[ GpuProcessSnapshot( real=GpuProcess(pid=xxxxxx, device=CudaDevice(cuda_index=1, ...), ...), ... ), ... ] ) ``` Please refer to section [Low-level APIs](#low-level-apis) for more information. 
------ #### Resource Metric Collector [`ResourceMetricCollector`](https://nvitop.readthedocs.io/en/latest/api/collector.html#nvitop.ResourceMetricCollector) is a class that collects resource metrics for host, GPUs and processes running on the GPUs. All metrics will be collected in an asynchronous manner. You can type `help(nvitop.ResourceMetricCollector)` in Python REPL for detailed documentation. ```python In [1]: from nvitop import ResourceMetricCollector, Device ...: import os ...: os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' ...: os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0' # comma-separated integers or UUID strings In [2]: collector = ResourceMetricCollector() # log all devices and descendant processes of the current process on the GPUs In [3]: collector = ResourceMetricCollector(root_pids={1}) # log all devices and all GPU processes In [4]: collector = ResourceMetricCollector(devices=Device(0), root_pids={1}) # log and all GPU processes on In [5]: collector = ResourceMetricCollector(devices=Device.cuda.all()) # use the CUDA ordinal In [6]: with collector(tag=''): ...: # Do something ...: collector.collect() # -> Dict[str, float] # key -> '///' { '/host/cpu_percent (%)/mean': 8.967849777683456, '/host/cpu_percent (%)/min': 6.1, '/host/cpu_percent (%)/max': 28.1, ..., '/host/memory_percent (%)/mean': 21.5, '/host/swap_percent (%)/mean': 0.3, '/host/memory_used (GiB)/mean': 91.0136418208109, '/host/load_average (%) (1 min)/mean': 10.251427386878328, '/host/load_average (%) (5 min)/mean': 10.072539414569503, '/host/load_average (%) (15 min)/mean': 11.91126970422139, ..., '/cuda:0 (gpu:3)/memory_used (MiB)/mean': 3.875, '/cuda:0 (gpu:3)/memory_free (MiB)/mean': 11015.562499999998, '/cuda:0 (gpu:3)/memory_total (MiB)/mean': 11019.437500000002, '/cuda:0 (gpu:3)/memory_percent (%)/mean': 0.0, '/cuda:0 (gpu:3)/gpu_utilization (%)/mean': 0.0, '/cuda:0 (gpu:3)/memory_utilization (%)/mean': 0.0, '/cuda:0 (gpu:3)/fan_speed (%)/mean': 22.0, '/cuda:0 (gpu:3)/temperature 
(C)/mean': 25.0, '/cuda:0 (gpu:3)/power_usage (W)/mean': 19.11166264116916, ..., '/cuda:1 (gpu:2)/memory_used (MiB)/mean': 8878.875, ..., '/cuda:2 (gpu:1)/memory_used (MiB)/mean': 8182.875, ..., '/cuda:3 (gpu:0)/memory_used (MiB)/mean': 9286.875, ..., '/pid:12345/host/cpu_percent (%)/mean': 151.34342772112265, '/pid:12345/host/host_memory (MiB)/mean': 44749.72373447514, '/pid:12345/host/host_memory_percent (%)/mean': 8.675082352111717, '/pid:12345/host/running_time (min)': 336.23803206741576, '/pid:12345/cuda:1 (gpu:4)/gpu_memory (MiB)/mean': 8861.0, '/pid:12345/cuda:1 (gpu:4)/gpu_memory_percent (%)/mean': 80.4, '/pid:12345/cuda:1 (gpu:4)/gpu_memory_utilization (%)/mean': 6.711118172407917, '/pid:12345/cuda:1 (gpu:4)/gpu_sm_utilization (%)/mean': 48.23283397736476, ..., '/duration (s)': 7.247399162035435, '/timestamp': 1655909466.9981883 } ``` The results can be easily logged into [TensorBoard](https://github.com/tensorflow/tensorboard) or a CSV file. For example: ```python import os import torch import torch.nn as nn import torch.nn.functional as F from torch.utils.tensorboard import SummaryWriter from nvitop import CudaDevice, ResourceMetricCollector from nvitop.callbacks.tensorboard import add_scalar_dict # Build networks and prepare datasets ... # Logger and status collector writer = SummaryWriter() collector = ResourceMetricCollector(devices=CudaDevice.all(), # log all visible CUDA devices and use the CUDA ordinal root_pids={os.getpid()}, # only log the descendant processes of the current process interval=1.0) # snapshot interval for background daemon thread # Start training global_step = 0 for epoch in range(num_epoch): with collector(tag='train'): for batch in train_dataset: with collector(tag='batch'): metrics = train(net, batch) global_step += 1 add_scalar_dict(writer, 'train', metrics, global_step=global_step) add_scalar_dict(writer, 'resources', # tag='resources/train/batch/...' 
collector.collect(), global_step=global_step) add_scalar_dict(writer, 'resources', # tag='resources/train/...' collector.collect(), global_step=epoch) with collector(tag='validate'): metrics = validate(net, validation_dataset) add_scalar_dict(writer, 'validate', metrics, global_step=epoch) add_scalar_dict(writer, 'resources', # tag='resources/validate/...' collector.collect(), global_step=epoch) ``` Another example for logging into a CSV file: ```python import datetime import time import pandas as pd from nvitop import ResourceMetricCollector collector = ResourceMetricCollector(root_pids={1}, interval=2.0) # log all devices and all GPU processes df = pd.DataFrame() with collector(tag='resources'): for _ in range(60): # Do something time.sleep(60) metrics = collector.collect() df_metrics = pd.DataFrame.from_records(metrics, index=[len(df)]) df = pd.concat([df, df_metrics], ignore_index=True) # Flush to CSV file ... df.insert(0, 'time', df['resources/timestamp'].map(datetime.datetime.fromtimestamp)) df.to_csv('results.csv', index=False) ``` You can also daemonize the collector in the background using [`collect_in_background`](https://nvitop.readthedocs.io/en/latest/api/collector.html#nvitop.collect_in_background) or [`ResourceMetricCollector.daemonize`](https://nvitop.readthedocs.io/en/latest/api/collector.html#nvitop.ResourceMetricCollector.daemonize) with callback functions. ```python from nvitop import Device, ResourceMetricCollector, collect_in_background logger = ... def on_collect(metrics): # will be called periodically if logger.is_closed(): # closed manually by user return False logger.log(metrics) return True def on_stop(collector): # will be called only once at stop if not logger.is_closed(): logger.close() # cleanup # Record metrics to the logger in the background every 5 seconds. # It will collect 5-second mean/min/max for each metric. 
collect_in_background( on_collect, ResourceMetricCollector(Device.cuda.all()), interval=5.0, on_stop=on_stop, ) ``` or simply: ```python ResourceMetricCollector(Device.cuda.all()).daemonize( on_collect, interval=5.0, on_stop=on_stop, ) ``` ------ #### Low-level APIs The full API references can be found at <https://nvitop.readthedocs.io>. ##### Device The [device module](https://nvitop.readthedocs.io/en/latest/api/device.html) provides:

| Class / Function | Description |
| --- | --- |
| `Device([index, uuid, bus_id])` | Live class of the GPU devices, different from the device snapshots. |
| `PhysicalDevice([index, uuid, bus_id])` | Class for physical devices. |
| `MigDevice([index, uuid, bus_id])` | Class for MIG devices. |
| `CudaDevice([cuda_index, nvml_index, uuid])` | Class for devices enumerated over the CUDA ordinal. |
| `CudaMigDevice([cuda_index, nvml_index, uuid])` | Class for CUDA devices that are MIG devices. |
| `parse_cuda_visible_devices([...])` | Parse the given `CUDA_VISIBLE_DEVICES` value into a list of NVML device indices. |
| `normalize_cuda_visible_devices([...])` | Parse the given `CUDA_VISIBLE_DEVICES` value and convert it into a comma-separated string of UUIDs. |

```python In [1]: from nvitop import ( ...: host, ...: Device, PhysicalDevice, CudaDevice, ...: parse_cuda_visible_devices, normalize_cuda_visible_devices ...: HostProcess, GpuProcess, ...: NA, ...: ) ...: import os ...: os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' ...: os.environ['CUDA_VISIBLE_DEVICES'] = '9,8,7,6' # comma-separated integers or UUID strings In [2]: Device.driver_version() Out[2]: '525.60.11' In [3]: Device.cuda_driver_version() # the maximum CUDA version supported by the driver (can be different from the CUDA Runtime version) Out[3]: '12.0' In [4]: Device.cuda_runtime_version() # the CUDA Runtime version Out[4]: '11.8' In [5]: Device.count() Out[5]: 10 In [6]: CudaDevice.count() # or `Device.cuda.count()` Out[6]: 4 In [7]: all_devices = Device.all() # all devices on board (physical device) ...: nvidia0, nvidia1 = Device.from_indices([0, 1]) # from physical device indices ...: all_devices Out[7]: [ PhysicalDevice(index=0, name="GeForce RTX 2080 Ti", total_memory=11019MiB), PhysicalDevice(index=1, name="GeForce RTX 2080 Ti", total_memory=11019MiB), PhysicalDevice(index=2, name="GeForce RTX 2080 Ti", total_memory=11019MiB), PhysicalDevice(index=3, name="GeForce RTX 2080 Ti", total_memory=11019MiB), PhysicalDevice(index=4, name="GeForce RTX 2080 Ti", total_memory=11019MiB), PhysicalDevice(index=5, name="GeForce RTX 2080 Ti", total_memory=11019MiB), PhysicalDevice(index=6, name="GeForce RTX 2080 Ti", total_memory=11019MiB), PhysicalDevice(index=7, name="GeForce RTX 2080 Ti", total_memory=11019MiB), PhysicalDevice(index=8, name="GeForce RTX 2080 Ti", total_memory=11019MiB), PhysicalDevice(index=9, name="GeForce RTX 2080 Ti", total_memory=11019MiB) ] In [8]: # NOTE: The function results might be different between calls when the `CUDA_VISIBLE_DEVICES` environment variable has been modified ...: cuda_visible_devices = Device.from_cuda_visible_devices() # from the `CUDA_VISIBLE_DEVICES` environment variable ...: cuda0, cuda1 = Device.from_cuda_indices([0, 
1]) # from CUDA device indices (might be different from physical device indices if `CUDA_VISIBLE_DEVICES` is set) ...: cuda_visible_devices = CudaDevice.all() # shortcut to `Device.from_cuda_visible_devices()` ...: cuda_visible_devices = Device.cuda.all() # `Device.cuda` is aliased to `CudaDevice` ...: cuda_visible_devices Out[8]: [ CudaDevice(cuda_index=0, nvml_index=9, name="NVIDIA GeForce RTX 2080 Ti", total_memory=11019MiB), CudaDevice(cuda_index=1, nvml_index=8, name="NVIDIA GeForce RTX 2080 Ti", total_memory=11019MiB), CudaDevice(cuda_index=2, nvml_index=7, name="NVIDIA GeForce RTX 2080 Ti", total_memory=11019MiB), CudaDevice(cuda_index=3, nvml_index=6, name="NVIDIA GeForce RTX 2080 Ti", total_memory=11019MiB) ] In [9]: nvidia0 = Device(0) # from device index (or `Device(index=0)`) ...: nvidia0 Out[9]: PhysicalDevice(index=0, name="GeForce RTX 2080 Ti", total_memory=11019MiB) In [10]: nvidia1 = Device(uuid='GPU-01234567-89ab-cdef-0123-456789abcdef') # from UUID string (or just `Device('GPU-xxxxxxxx-...')`) ...: nvidia2 = Device(bus_id='00000000:06:00.0') # from PCI bus ID ...: nvidia1 Out[10]: PhysicalDevice(index=1, name="GeForce RTX 2080 Ti", total_memory=11019MiB) In [11]: cuda0 = CudaDevice(0) # from CUDA device index (equivalent to `CudaDevice(cuda_index=0)`) ...: cuda1 = CudaDevice(nvml_index=8) # from physical device index ...: cuda3 = CudaDevice(uuid='GPU-xxxxxxxx-...') # from UUID string ...: cuda4 = Device.cuda(4) # `Device.cuda` is aliased to `CudaDevice` ...: cuda0 Out[11]: CudaDevice(cuda_index=0, nvml_index=9, name="NVIDIA GeForce RTX 2080 Ti", total_memory=11019MiB) In [12]: nvidia0.memory_used() # in bytes Out[12]: 9293398016 In [13]: nvidia0.memory_used_human() Out[13]: '8862MiB' In [14]: nvidia0.gpu_utilization() # in percentage Out[14]: 5 In [15]: nvidia0.processes() # type: Dict[int, GpuProcess] Out[15]: { 52059: GpuProcess(pid=52059, gpu_memory=7885MiB, type=C, device=PhysicalDevice(index=0, name="GeForce RTX 2080 Ti", 
total_memory=11019MiB), host=HostProcess(pid=52059, name='ipython3', status='sleeping', started='14:31:22')), 53002: GpuProcess(pid=53002, gpu_memory=967MiB, type=C, device=PhysicalDevice(index=0, name="GeForce RTX 2080 Ti", total_memory=11019MiB), host=HostProcess(pid=53002, name='python', status='running', started='14:31:59')) } In [16]: nvidia1_snapshot = nvidia1.as_snapshot() ...: nvidia1_snapshot Out[16]: PhysicalDeviceSnapshot( real=PhysicalDevice(index=1, name="GeForce RTX 2080 Ti", total_memory=11019MiB), bus_id='00000000:05:00.0', compute_mode='Default', clock_infos=ClockInfos(graphics=1815, sm=1815, memory=6800, video=1680), # in MHz clock_speed_infos=ClockSpeedInfos(current=ClockInfos(graphics=1815, sm=1815, memory=6800, video=1680), max=ClockInfos(graphics=2100, sm=2100, memory=7000, video=1950)), # in MHz cuda_compute_capability=(7, 5), current_driver_model='N/A', decoder_utilization=0, # in percentage display_active='Disabled', display_mode='Disabled', encoder_utilization=0, # in percentage fan_speed=22, # in percentage gpu_utilization=17, # in percentage (NOTE: this is the utilization rate of SMs, i.e. 
GPU percent) index=1, max_clock_infos=ClockInfos(graphics=2100, sm=2100, memory=7000, video=1950), # in MHz memory_clock=6800, # in MHz memory_free=10462232576, # in bytes memory_free_human='9977MiB', memory_info=MemoryInfo(total=11554717696, free=10462232576, used=1092485120) # in bytes memory_percent=9.5, # in percentage (NOTE: this is the percentage of used GPU memory) memory_total=11554717696, # in bytes memory_total_human='11019MiB', memory_usage='1041MiB / 11019MiB', memory_used=1092485120, # in bytes memory_used_human='1041MiB', memory_utilization=7, # in percentage (NOTE: this is the utilization rate of GPU memory bandwidth) mig_mode='N/A', name='GeForce RTX 2080 Ti', pcie_rx_throughput=1000, # in KiB/s pcie_rx_throughput_human='1000KiB/s', pcie_throughput=ThroughputInfo(tx=1000, rx=1000), # in KiB/s pcie_tx_throughput=1000, # in KiB/s pcie_tx_throughput_human='1000KiB/s', performance_state='P2', persistence_mode='Disabled', power_limit=250000, # in milliwatts (mW) power_status='66W / 250W', # in watts (W) power_usage=66051, # in milliwatts (mW) sm_clock=1815, # in MHz temperature=39, # in Celsius total_volatile_uncorrected_ecc_errors='N/A', utilization_rates=UtilizationRates(gpu=17, memory=7, encoder=0, decoder=0), # in percentage uuid='GPU-01234567-89ab-cdef-0123-456789abcdef', ) In [17]: nvidia1_snapshot.memory_percent # snapshot uses properties instead of function calls Out[17]: 9.5 In [18]: nvidia1_snapshot['memory_info'] # snapshot also supports `__getitem__` by string Out[18]: MemoryInfo(total=11554717696, free=10462232576, used=1092485120) In [19]: nvidia1_snapshot.bar1_memory_info # snapshot will automatically retrieve not presented attributes from `real` Out[19]: MemoryInfo(total=268435456, free=257622016, used=10813440) ``` **NOTE:** Some entry values may be `'N/A'` (type: [`NaType`](https://nvitop.readthedocs.io/en/latest/index.html#nvitop.NaType), a subclass of `str`) when the corresponding resources are not applicable. 
The [`NA`](https://nvitop.readthedocs.io/en/latest/index.html#nvitop.NA) value supports arithmetic operations. It acts like `math.nan: float`. ```python >>> from nvitop import NA >>> NA 'N/A' >>> 'memory usage: {}'.format(NA) # NA is an instance of `str` 'memory usage: N/A' >>> NA.lower() # NA is an instance of `str` 'n/a' >>> NA.ljust(5) # NA is an instance of `str` 'N/A ' >>> NA + 'str' # string contamination if the operand is a string 'N/Astr' >>> float(NA) # explicit conversion to float (`math.nan`) nan >>> NA + 1 # auto-casting to float if the operand is a number nan >>> NA * 1024 # auto-casting to float if the operand is a number nan >>> NA / (1024 * 1024) # auto-casting to float if the operand is a number nan ``` You can use `entry != 'N/A'` conditions to avoid exceptions. It's safe to use `float(entry)` for numbers while `NaType` will be converted to `math.nan`. For example: ```python memory_used: Union[int, NaType] = device.memory_used() # memory usage in bytes or `'N/A'` memory_used_in_mib: float = float(memory_used) / (1 << 20) # memory usage in Mebibytes (MiB) or `math.nan` ``` It's safe to compare `NaType` with numbers, but `NaType` is always larger than any number: ```python devices_by_used_memory = sorted(Device.all(), key=Device.memory_used, reverse=True) # it's safe to compare `'N/A'` with numbers devices_by_free_memory = sorted(Device.all(), key=Device.memory_free, reverse=True) # please add `memory_free != 'N/A'` checks if sort in descending order here ``` See [`nvitop.NaType`](https://nvitop.readthedocs.io/en/latest/apis/index.html#nvitop.NaType) documentation for more details. ##### Process The [process module](https://nvitop.readthedocs.io/en/latest/api/process.html) provides:

| Class / Function | Description |
| --- | --- |
| `HostProcess([pid])` | Represents an OS process with the given PID. |
| `GpuProcess(pid, device[, gpu_memory, ...])` | Represents a process with the given PID running on the given GPU device. |
| `command_join(cmdline)` | Returns a shell-escaped string from command line arguments. |

```python In [20]: processes = nvidia1.processes() # type: Dict[int, GpuProcess] ...: processes Out[20]: { 23266: GpuProcess(pid=23266, gpu_memory=1031MiB, type=C, device=Device(index=1, name="GeForce RTX 2080 Ti", total_memory=11019MiB), host=HostProcess(pid=23266, name='python3', status='running', started='2021-05-10 21:02:40')) } In [21]: process = processes[23266] ...: process Out[21]: GpuProcess(pid=23266, gpu_memory=1031MiB, type=C, device=Device(index=1, name="GeForce RTX 2080 Ti", total_memory=11019MiB), host=HostProcess(pid=23266, name='python3', status='running', started='2021-05-10 21:02:40')) In [22]: process.status() # GpuProcess will automatically inherit attributes from GpuProcess.host Out[22]: 'running' In [23]: process.cmdline() # type: List[str] Out[23]: ['python3', 'rllib_train.py'] In [24]: process.command() # type: str Out[24]: 'python3 rllib_train.py' In [25]: process.cwd() # GpuProcess will automatically inherit attributes from GpuProcess.host Out[25]: '/home/xxxxxx/Projects/xxxxxx' In [26]: process.gpu_memory_human() Out[26]: '1031MiB' In [27]: process.as_snapshot() Out[27]: GpuProcessSnapshot( real=GpuProcess(pid=23266, gpu_memory=1031MiB, type=C, device=PhysicalDevice(index=1, name="GeForce RTX 2080 Ti", total_memory=11019MiB), host=HostProcess(pid=23266, name='python3', status='running', started='2021-05-10 21:02:40')), cmdline=['python3', 'rllib_train.py'], command='python3 rllib_train.py', compute_instance_id='N/A', cpu_percent=98.5, # in percentage device=PhysicalDevice(index=1, name="GeForce RTX 2080 Ti", total_memory=11019MiB), gpu_encoder_utilization=0, # in percentage gpu_decoder_utilization=0, # in percentage gpu_instance_id='N/A', gpu_memory=1081081856, # in bytes gpu_memory_human='1031MiB', gpu_memory_percent=9.4, # in percentage (NOTE: this is the percentage of used GPU memory) gpu_memory_utilization=5, # in percentage (NOTE: this is the utilization rate of GPU memory bandwidth) gpu_sm_utilization=0, # in percentage (NOTE: this 
is the utilization rate of SMs, i.e. GPU percent) host=HostProcessSnapshot( real=HostProcess(pid=23266, name='python3', status='running', started='2021-05-10 21:02:40'), cmdline=['python3', 'rllib_train.py'], command='python3 rllib_train.py', cpu_percent=98.5, # in percentage host_memory=9113627439, # in bytes host_memory_human='8691MiB', is_running=True, memory_percent=1.6849018430285683, # in percentage name='python3', running_time=datetime.timedelta(days=1, seconds=80013, microseconds=470024), running_time_human='46:13:33', running_time_in_seconds=166413.470024, status='running', username='panxuehai', ), host_memory=9113627439, # in bytes host_memory_human='8691MiB', is_running=True, memory_percent=1.6849018430285683, # in percentage (NOTE: this is the percentage of used host memory) name='python3', pid=23266, running_time=datetime.timedelta(days=1, seconds=80013, microseconds=470024), running_time_human='46:13:33', running_time_in_seconds=166413.470024, status='running', type='C', # 'C' for Compute / 'G' for Graphics / 'C+G' for Both username='panxuehai', ) In [28]: process.uids() # GpuProcess will automatically inherit attributes from GpuProcess.host Out[28]: puids(real=1001, effective=1001, saved=1001) In [29]: process.kill() # GpuProcess will automatically inherit attributes from GpuProcess.host In [30]: list(map(Device.processes, all_devices)) # all processes Out[30]: [ { 52059: GpuProcess(pid=52059, gpu_memory=7885MiB, type=C, device=PhysicalDevice(index=0, name="GeForce RTX 2080 Ti", total_memory=11019MiB), host=HostProcess(pid=52059, name='ipython3', status='sleeping', started='14:31:22')), 53002: GpuProcess(pid=53002, gpu_memory=967MiB, type=C, device=PhysicalDevice(index=0, name="GeForce RTX 2080 Ti", total_memory=11019MiB), host=HostProcess(pid=53002, name='python', status='running', started='14:31:59')) }, {}, {}, {}, {}, {}, {}, {}, { 84748: GpuProcess(pid=84748, gpu_memory=8975MiB, type=C, device=PhysicalDevice(index=8, name="GeForce RTX 2080 Ti", 
total_memory=11019MiB), host=HostProcess(pid=84748, name='python', status='running', started='11:13:38')) }, { 84748: GpuProcess(pid=84748, gpu_memory=8341MiB, type=C, device=PhysicalDevice(index=9, name="GeForce RTX 2080 Ti", total_memory=11019MiB), host=HostProcess(pid=84748, name='python', status='running', started='11:13:38')) } ] In [31]: this = HostProcess(os.getpid()) ...: this Out[31]: HostProcess(pid=35783, name='python', status='running', started='19:19:00') In [32]: this.cmdline() # type: List[str] Out[32]: ['python', '-c', 'import IPython; IPython.terminal.ipapp.launch_new_instance()'] In [33]: this.command() # not simply `' '.join(cmdline)` but quotes are added Out[33]: 'python -c "import IPython; IPython.terminal.ipapp.launch_new_instance()"' In [34]: this.memory_info() Out[34]: pmem(rss=83988480, vms=343543808, shared=12079104, text=8192, lib=0, data=297435136, dirty=0) In [35]: import cupy as cp ...: x = cp.zeros((10000, 1000)) ...: this = GpuProcess(os.getpid(), cuda0) # construct from `GpuProcess(pid, device)` explicitly rather than calling `device.processes()` ...: this Out[35]: GpuProcess(pid=35783, gpu_memory=N/A, type=N/A, device=CudaDevice(cuda_index=0, nvml_index=9, name="NVIDIA GeForce RTX 2080 Ti", total_memory=11019MiB), host=HostProcess(pid=35783, name='python', status='running', started='19:19:00')) In [36]: this.update_gpu_status() # update used GPU memory from new driver queries Out[36]: 267386880 In [37]: this Out[37]: GpuProcess(pid=35783, gpu_memory=255MiB, type=C, device=CudaDevice(cuda_index=0, nvml_index=9, name="NVIDIA GeForce RTX 2080 Ti", total_memory=11019MiB), host=HostProcess(pid=35783, name='python', status='running', started='19:19:00')) In [38]: id(this) == id(GpuProcess(os.getpid(), cuda0)) # IMPORTANT: the instance will be reused while the process is running Out[38]: True ``` ##### Host (inherited from [psutil](https://github.com/giampaolo/psutil)) ```python In [39]: host.cpu_count() Out[39]: 88 In [40]: 
host.cpu_percent() Out[40]: 18.5 In [41]: host.cpu_times() Out[41]: scputimes(user=2346377.62, nice=53321.44, system=579177.52, idle=10323719.85, iowait=28750.22, irq=0.0, softirq=11566.87, steal=0.0, guest=0.0, guest_nice=0.0) In [42]: host.load_average() Out[42]: (14.88, 17.8, 19.91) In [43]: host.virtual_memory() Out[43]: svmem(total=270352478208, available=192275968000, percent=28.9, used=53350518784, free=88924037120, active=125081112576, inactive=44803993600, buffers=37006450688, cached=91071471616, shared=23820632064, slab=8200687616) In [44]: host.memory_percent() Out[44]: 28.9 In [45]: host.swap_memory() Out[45]: sswap(total=65534947328, used=475136, free=65534472192, percent=0.0, sin=2404139008, sout=4259434496) In [46]: host.swap_percent() Out[46]: 0.0 ``` ------ ## Screenshots ![Screen Recording](https://user-images.githubusercontent.com/16078332/113173772-508dc380-927c-11eb-84c5-b6f496e54c08.gif) Example output of `nvitop -1`:

Screenshot

Example output of `nvitop`:
Full Compact
Full Compact
Tree-view screen (shortcut: t) for GPU processes and their ancestors:

Tree-view

**NOTE:** The process tree is built in backward order (recursively back to the tree root). Only GPU processes along with their children and ancestors (parents and grandparents ...) will be shown. Not all running processes will be displayed. Environment variable screen (shortcut: e):

Environment Screen

Spectrum-like bar charts (with option --colorful):

Spectrum-like Bar Charts

------ ## Changelog See [CHANGELOG.md](https://github.com/XuehaiPan/nvitop/blob/HEAD/CHANGELOG.md). ------ ## License The source code of `nvitop` is dual-licensed by the **Apache License, Version 2.0 (Apache-2.0)** and **GNU General Public License, Version 3 (GPL-3.0)**. The `nvitop` CLI is released under the **GPL-3.0** license while the remaining part of `nvitop` is released under the **Apache-2.0** license. The license files can be found at [LICENSE](https://github.com/XuehaiPan/nvitop/blob/HEAD/LICENSE) (Apache-2.0) and [COPYING](https://github.com/XuehaiPan/nvitop/blob/HEAD/COPYING) (GPL-3.0). The source code is organized as: ```text nvitop (GPL-3.0) ├── __init__.py (Apache-2.0) ├── version.py (Apache-2.0) ├── api (Apache-2.0) │ ├── LICENSE (Apache-2.0) │ └── * (Apache-2.0) ├── callbacks (Apache-2.0) │ ├── LICENSE (Apache-2.0) │ └── * (Apache-2.0) ├── select.py (Apache-2.0) ├── __main__.py (GPL-3.0) ├── cli.py (GPL-3.0) └── tui (GPL-3.0) ├── COPYING (GPL-3.0) └── * (GPL-3.0) ``` ### Copyright Notice Please feel free to use `nvitop` as a dependency for your own projects. The following Python import statements are permitted: ```python import nvitop import nvitop as alias import nvitop.api as api import nvitop.device as device from nvitop import * from nvitop.api import * from nvitop import Device, ResourceMetricCollector ``` The public APIs from `nvitop` are released under the **Apache License, Version 2.0 (Apache-2.0)**. The original license files can be found at [LICENSE](https://github.com/XuehaiPan/nvitop/blob/HEAD/LICENSE), [nvitop/api/LICENSE](https://github.com/XuehaiPan/nvitop/blob/HEAD/nvitop/api/LICENSE), and [nvitop/callbacks/LICENSE](https://github.com/XuehaiPan/nvitop/blob/HEAD/nvitop/callbacks/LICENSE). The CLI of `nvitop` is released under the **GNU General Public License, Version 3 (GPL-3.0)**. 
The original license files can be found at [COPYING](https://github.com/XuehaiPan/nvitop/blob/HEAD/COPYING) and [nvitop/tui/COPYING](https://github.com/XuehaiPan/nvitop/blob/HEAD/nvitop/tui/COPYING). If you dynamically load the source code of `nvitop`'s CLI or TUI: ```python from nvitop import cli from nvitop import tui import nvitop.cli import nvitop.tui ``` your source code should also be released under the GPL-3.0 License. If you want to add or modify some features of `nvitop`'s CLI, or copy some source code of `nvitop`'s CLI into your own code, the source code should also be released under the GPL-3.0 License (as `nvitop` contains some modified source code from [ranger](https://github.com/ranger/ranger) under the GPL-3.0 License). nvitop-1.4.2/docs/000077500000000000000000000000001474547113600140015ustar00rootroot00000000000000nvitop-1.4.2/docs/Makefile000066400000000000000000000011761474547113600154460ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) nvitop-1.4.2/docs/README.md000066400000000000000000000011641474547113600152620ustar00rootroot00000000000000# The `nvitop`'s Documentation This directory contains the documentation of `nvitop`, the one-stop solution for GPU process management. 
### Requirements - `sphinx` - `sphinx-autoapi` - `sphinx-autobuild` - `sphinx-copybutton` - `sphinx-rtd-theme` - `make` ### Steps to build the documentation ```bash cd docs # navigate to this directory python3 -m venv --upgrade-deps .venv source .venv/bin/activate pip3 install -r ../requirements.txt -r requirements.txt sphinx-autobuild --watch ../nvitop --open-browser source build ``` nvitop-1.4.2/docs/make.bat000066400000000000000000000014441474547113600154110ustar00rootroot00000000000000@ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=source set BUILDDIR=build %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.https://www.sphinx-doc.org/ exit /b 1 ) if "%1" == "" goto help %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd nvitop-1.4.2/docs/requirements.txt000066400000000000000000000002721474547113600172660ustar00rootroot00000000000000sphinx >= 5.0 sphinx-autoapi sphinx-autobuild sphinx-copybutton sphinx-rtd-theme lightning >= 2.0.0, < 3.0.0a0 pytorch-lightning >= 1.5.0, < 2.0.0a0 tensorflow-cpu >= 2.0.0, < 2.20.0a0 
nvitop-1.4.2/docs/source/000077500000000000000000000000001474547113600153015ustar00rootroot00000000000000nvitop-1.4.2/docs/source/_static/000077500000000000000000000000001474547113600167275ustar00rootroot00000000000000nvitop-1.4.2/docs/source/_static/.gitkeep000066400000000000000000000000001474547113600203460ustar00rootroot00000000000000nvitop-1.4.2/docs/source/_static/style.css000066400000000000000000000000511474547113600205750ustar00rootroot00000000000000.wy-nav-content { max-width: none; } nvitop-1.4.2/docs/source/_templates/000077500000000000000000000000001474547113600174365ustar00rootroot00000000000000nvitop-1.4.2/docs/source/_templates/.gitkeep000066400000000000000000000000001474547113600210550ustar00rootroot00000000000000nvitop-1.4.2/docs/source/api/000077500000000000000000000000001474547113600160525ustar00rootroot00000000000000nvitop-1.4.2/docs/source/api/caching.rst000066400000000000000000000002141474547113600201750ustar00rootroot00000000000000nvitop.caching module --------------------- .. currentmodule:: nvitop .. autosummary:: ttl_cache .. autofunction:: nvitop.ttl_cache nvitop-1.4.2/docs/source/api/collector.rst000066400000000000000000000007651474547113600206020ustar00rootroot00000000000000nvitop.collector module ----------------------- .. currentmodule:: nvitop .. autosummary:: take_snapshots collect_in_background ResourceMetricCollector ResourceMetricCollector.daemonize .. automodule:: nvitop.collector :no-members: .. autofunction:: nvitop.take_snapshots .. autofunction:: nvitop.collect_in_background .. autoclass:: nvitop.ResourceMetricCollector :members: :inherited-members: :undoc-members: :show-inheritance: :member-order: bysource nvitop-1.4.2/docs/source/api/device.rst000066400000000000000000000017321474547113600200460ustar00rootroot00000000000000nvitop.device module -------------------- .. currentmodule:: nvitop .. autosummary:: Device PhysicalDevice MigDevice CudaDevice CudaMigDevice parse_cuda_visible_devices normalize_cuda_visible_devices .. 
automodule:: nvitop.device :no-members: .. autoclass:: nvitop.Device :members: :undoc-members: :show-inheritance: :member-order: bysource .. autoclass:: nvitop.PhysicalDevice :members: :undoc-members: :show-inheritance: :member-order: bysource .. autoclass:: nvitop.MigDevice :members: :undoc-members: :show-inheritance: :member-order: bysource .. autoclass:: nvitop.CudaDevice :members: :undoc-members: :show-inheritance: :member-order: bysource .. autoclass:: nvitop.CudaMigDevice :members: :undoc-members: :show-inheritance: :member-order: bysource .. autofunction:: nvitop.parse_cuda_visible_devices .. autofunction:: nvitop.normalize_cuda_visible_devices nvitop-1.4.2/docs/source/api/host.rst000066400000000000000000000002301474547113600175540ustar00rootroot00000000000000nvitop.host module ------------------ .. automodule:: nvitop.host :members: :undoc-members: :show-inheritance: :member-order: bysource nvitop-1.4.2/docs/source/api/libcuda.rst000066400000000000000000000002411474547113600202040ustar00rootroot00000000000000nvitop.libcuda module --------------------- .. automodule:: nvitop.libcuda :members: :undoc-members: :show-inheritance: :member-order: bysource nvitop-1.4.2/docs/source/api/libcudart.rst000066400000000000000000000002471474547113600205600ustar00rootroot00000000000000nvitop.libcudart module ----------------------- .. automodule:: nvitop.libcudart :members: :undoc-members: :show-inheritance: :member-order: bysource nvitop-1.4.2/docs/source/api/libnvml.rst000066400000000000000000000002411474547113600202440ustar00rootroot00000000000000nvitop.libnvml module --------------------- .. automodule:: nvitop.libnvml :members: :undoc-members: :show-inheritance: :member-order: bysource nvitop-1.4.2/docs/source/api/process.rst000066400000000000000000000010011474547113600202520ustar00rootroot00000000000000nvitop.process module --------------------- .. currentmodule:: nvitop .. autosummary:: HostProcess GpuProcess command_join .. 
automodule:: nvitop.process :no-members: .. autoclass:: nvitop.HostProcess :members: :inherited-members: :undoc-members: :show-inheritance: :member-order: bysource .. autoclass:: nvitop.GpuProcess :members: :inherited-members: :undoc-members: :show-inheritance: :member-order: bysource .. autofunction:: nvitop.command_join nvitop-1.4.2/docs/source/api/utils.rst000066400000000000000000000003361474547113600177460ustar00rootroot00000000000000nvitop.utils module ------------------- .. automodule:: nvitop.utils :members: :undoc-members: :show-inheritance: :member-order: bysource :exclude-members: NA, NaType, NotApplicable, NotApplicableType nvitop-1.4.2/docs/source/callbacks.rst000066400000000000000000000020371474547113600177540ustar00rootroot00000000000000nvitop.callbacks package ======================== Submodules ---------- nvitop.callbacks.keras module ----------------------------- .. automodule:: nvitop.callbacks.keras :members: :undoc-members: :show-inheritance: nvitop.callbacks.lightning module --------------------------------- .. automodule:: nvitop.callbacks.lightning :members: :undoc-members: :show-inheritance: nvitop.callbacks.pytorch\_lightning module ------------------------------------------ .. automodule:: nvitop.callbacks.pytorch_lightning :members: :undoc-members: :show-inheritance: nvitop.callbacks.tensorboard module ----------------------------------- .. automodule:: nvitop.callbacks.tensorboard :members: :undoc-members: :show-inheritance: nvitop.callbacks.utils module ----------------------------- .. automodule:: nvitop.callbacks.utils :members: :undoc-members: :show-inheritance: Module contents --------------- .. automodule:: nvitop.callbacks :members: :undoc-members: :show-inheritance: nvitop-1.4.2/docs/source/conf.py000066400000000000000000000114031474547113600165770ustar00rootroot00000000000000# Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. 
For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # pylint: disable=all # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) # -- Project information ----------------------------------------------------- project = 'nvitop: the one-stop solution for GPU process management.' copyright = '2021-2025, Xuehai Pan' author = 'Xuehai Pan' # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.napoleon', 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.mathjax', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode', 'sphinx.ext.githubpages', 'sphinx.ext.extlinks', 'sphinx_copybutton', 'sphinx_rtd_theme', ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The master toctree document. master_doc = 'index' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. 
language = 'en' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'build', 'Thumbs.db', '.DS_Store'] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'default' # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] html_css_files = ['style.css'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # The default sidebars (for documents that don't match any pattern) are # defined by theme itself. Builtin themes are using these templates by # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', # 'searchbox.html']``. # # html_sidebars = {} extlinks = { 'gitcode': ('https://github.com/XuehaiPan/nvitop/blob/HEAD/%s', '%s'), 'issue': ('https://github.com/XuehaiPan/nvitop/issues/%s', 'issue %s'), } # -- Options for manual page output ------------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
man_pages = [(master_doc, 'nvitop', 'An interactive NVIDIA-GPU process viewer.', [author], 1)] # -- Extension configuration ------------------------------------------------- # -- Options for napoleon extension ------------------------------------------ napoleon_include_init_with_doc = True napoleon_include_private_with_doc = False napoleon_include_special_with_doc = True # -- Options for intersphinx extension --------------------------------------- # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = {'python': ('https://docs.python.org/3', None)} # -- Options for todo extension ---------------------------------------------- # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True # -- Options for copybutton extension ---------------------------------------- # To make sphinx-copybutton skip all prompt characters generated by pygments copybutton_exclude = '.linenos, .gp' nvitop-1.4.2/docs/source/fix-psutil-docstring.sh000077500000000000000000000002641474547113600217400ustar00rootroot00000000000000#!/usr/bin/env bash # shellcheck disable=SC2312 exec sed -i -E 's/^ process identity for every yielded instance$/ \0/' "$(python3 -c "print(__import__('psutil').__file__)")" nvitop-1.4.2/docs/source/index.rst000066400000000000000000000223061474547113600171450ustar00rootroot00000000000000Welcome to nvitop's documentation! ================================== |GitHub|_ |Python Version|_ |PyPI Package|_ |Conda Package|_ |Documentation Status|_ |Downloads|_ |GitHub Repo Stars|_ |License|_ An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for GPU process management. .. figure:: https://user-images.githubusercontent.com/16078332/171005261-1aad126e-dc27-4ed3-a89b-7f9c1c998bf7.png :align: center The CLI from ``nvitop``. .. |GitHub| image:: https://img.shields.io/badge/GitHub-Homepage-blue?logo=github .. _GitHub: https://github.com/XuehaiPan/nvitop .. 
|Python Version| image:: https://img.shields.io/badge/Python-3.7%2B-brightgreen .. _Python Version: https://pypi.org/project/nvitop .. |PyPI Package| image:: https://img.shields.io/pypi/v/nvitop?label=pypi&logo=pypi .. _PyPI Package: https://pypi.org/project/nvitop .. |Conda Package| image:: https://img.shields.io/conda/vn/conda-forge/nvitop?label=conda&logo=condaforge .. _Conda Package: https://anaconda.org/conda-forge/nvitop .. |Conda-forge Package| image:: https://img.shields.io/conda/v/conda-forge/nvitop?logo=condaforge .. _Conda-forge Package: https://anaconda.org/conda-forge/nvitop .. |Documentation Status| image:: https://img.shields.io/readthedocs/nvitop?label=docs&logo=readthedocs .. _Documentation Status: https://nvitop.readthedocs.io .. |Downloads| image:: https://static.pepy.tech/personalized-badge/nvitop?period=total&left_color=grey&right_color=blue&left_text=downloads .. _Downloads: https://pepy.tech/project/nvitop .. |GitHub Repo Stars| image:: https://img.shields.io/github/stars/XuehaiPan/nvitop?label=stars&logo=github&color=brightgreen .. _GitHub Repo Stars: https://github.com/XuehaiPan/nvitop .. 
|License| image:: https://img.shields.io/github/license/XuehaiPan/nvitop?label=license&logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAyNCAyNCIgd2lkdGg9IjI0IiBoZWlnaHQ9IjI0IiBmaWxsPSIjZmZmZmZmIj48cGF0aCBmaWxsLXJ1bGU9ImV2ZW5vZGQiIGQ9Ik0xMi43NSAyLjc1YS43NS43NSAwIDAwLTEuNSAwVjQuNUg5LjI3NmExLjc1IDEuNzUgMCAwMC0uOTg1LjMwM0w2LjU5NiA1Ljk1N0EuMjUuMjUgMCAwMTYuNDU1IDZIMi4zNTNhLjc1Ljc1IDAgMTAwIDEuNUgzLjkzTC41NjMgMTUuMThhLjc2Mi43NjIgMCAwMC4yMS44OGMuMDguMDY0LjE2MS4xMjUuMzA5LjIyMS4xODYuMTIxLjQ1Mi4yNzguNzkyLjQzMy42OC4zMTEgMS42NjIuNjIgMi44NzYuNjJhNi45MTkgNi45MTkgMCAwMDIuODc2LS42MmMuMzQtLjE1NS42MDYtLjMxMi43OTItLjQzMy4xNS0uMDk3LjIzLS4xNTguMzEtLjIyM2EuNzUuNzUgMCAwMC4yMDktLjg3OEw1LjU2OSA3LjVoLjg4NmMuMzUxIDAgLjY5NC0uMTA2Ljk4NC0uMzAzbDEuNjk2LTEuMTU0QS4yNS4yNSAwIDAxOS4yNzUgNmgxLjk3NXYxNC41SDYuNzYzYS43NS43NSAwIDAwMCAxLjVoMTAuNDc0YS43NS43NSAwIDAwMC0xLjVIMTIuNzVWNmgxLjk3NGMuMDUgMCAuMS4wMTUuMTQuMDQzbDEuNjk3IDEuMTU0Yy4yOS4xOTcuNjMzLjMwMy45ODQuMzAzaC44ODZsLTMuMzY4IDcuNjhhLjc1Ljc1IDAgMDAuMjMuODk2Yy4wMTIuMDA5IDAgMCAuMDAyIDBhMy4xNTQgMy4xNTQgMCAwMC4zMS4yMDZjLjE4NS4xMTIuNDUuMjU2Ljc5LjRhNy4zNDMgNy4zNDMgMCAwMDIuODU1LjU2OCA3LjM0MyA3LjM0MyAwIDAwMi44NTYtLjU2OWMuMzM4LS4xNDMuNjA0LS4yODcuNzktLjM5OWEzLjUgMy41IDAgMDAuMzEtLjIwNi43NS43NSAwIDAwLjIzLS44OTZMMjAuMDcgNy41aDEuNTc4YS43NS43NSAwIDAwMC0xLjVoLTQuMTAyYS4yNS4yNSAwIDAxLS4xNC0uMDQzbC0xLjY5Ny0xLjE1NGExLjc1IDEuNzUgMCAwMC0uOTg0LS4zMDNIMTIuNzVWMi43NXpNMi4xOTMgMTUuMTk4YTUuNDE4IDUuNDE4IDAgMDAyLjU1Ny42MzUgNS40MTggNS40MTggMCAwMDIuNTU3LS42MzVMNC43NSA5LjM2OGwtMi41NTcgNS44M3ptMTQuNTEtLjAyNGMuMDgyLjA0LjE3NC4wODMuMjc1LjEyNi41My4yMjMgMS4zMDUuNDUgMi4yNzIuNDVhNS44NDYgNS44NDYgMCAwMDIuNTQ3LS41NzZMMTkuMjUgOS4zNjdsLTIuNTQ3IDUuODA3eiI+PC9wYXRoPjwvc3ZnPgo= .. _License: https://github.com/XuehaiPan/nvitop#license ------ Installation """""""""""" **It is highly recommended to install nvitop in an isolated virtual environment.** Simple installation and run via `pipx `_: .. 
code:: bash pipx run nvitop Install from PyPI (|PyPI Package|_): .. code:: bash pip3 install --upgrade nvitop .. note:: Python 3.7+ is required, and Python versions lower than 3.7 is not supported. Install from conda-forge (|Conda-forge Package|_): .. code:: bash conda install -c conda-forge nvitop Install the latest version from GitHub (|Commit Count|): .. code:: bash pip3 install --upgrade pip setuptools pip3 install git+https://github.com/XuehaiPan/nvitop.git#egg=nvitop Or, clone this repo and install manually: .. code:: bash git clone --depth=1 https://github.com/XuehaiPan/nvitop.git cd nvitop pip3 install . If this repo is useful to you, please star ⭐️ it to let more people know 🤗. |GitHub Repo Stars|_ .. |Commit Count| image:: https://img.shields.io/github/commits-since/XuehaiPan/nvitop/v1.4.2 ------ Quick Start """"""""""" A minimal script to monitor the GPU devices based on APIs from ``nvitop``: .. code-block:: python from nvitop import Device devices = Device.all() # or Device.cuda.all() for device in devices: processes = device.processes() # type: Dict[int, GpuProcess] sorted_pids = sorted(processes) print(device) print(f' - Fan speed: {device.fan_speed()}%') print(f' - Temperature: {device.temperature()}C') print(f' - GPU utilization: {device.gpu_utilization()}%') print(f' - Total memory: {device.memory_total_human()}') print(f' - Used memory: {device.memory_used_human()}') print(f' - Free memory: {device.memory_free_human()}') print(f' - Processes ({len(processes)}): {sorted_pids}') for pid in sorted_pids: print(f' - {processes[pid]}') print('-' * 120) Another more advanced approach with coloring: .. 
code-block:: python import time from nvitop import Device, GpuProcess, NA, colored print(colored(time.strftime('%a %b %d %H:%M:%S %Y'), color='red', attrs=('bold',))) devices = Device.cuda.all() # or `Device.all()` to use NVML ordinal instead separator = False for device in devices: processes = device.processes() # type: Dict[int, GpuProcess] print(colored(str(device), color='green', attrs=('bold',))) print(colored(' - Fan speed: ', color='blue', attrs=('bold',)) + f'{device.fan_speed()}%') print(colored(' - Temperature: ', color='blue', attrs=('bold',)) + f'{device.temperature()}C') print(colored(' - GPU utilization: ', color='blue', attrs=('bold',)) + f'{device.gpu_utilization()}%') print(colored(' - Total memory: ', color='blue', attrs=('bold',)) + f'{device.memory_total_human()}') print(colored(' - Used memory: ', color='blue', attrs=('bold',)) + f'{device.memory_used_human()}') print(colored(' - Free memory: ', color='blue', attrs=('bold',)) + f'{device.memory_free_human()}') if len(processes) > 0: processes = GpuProcess.take_snapshots(processes.values(), failsafe=True) processes.sort(key=lambda process: (process.username, process.pid)) print(colored(f' - Processes ({len(processes)}):', color='blue', attrs=('bold',))) fmt = ' {pid:<5} {username:<8} {cpu:>5} {host_memory:>8} {time:>8} {gpu_memory:>8} {sm:>3} {command:<}'.format print(colored(fmt(pid='PID', username='USERNAME', cpu='CPU%', host_memory='HOST-MEM', time='TIME', gpu_memory='GPU-MEM', sm='SM%', command='COMMAND'), attrs=('bold',))) for snapshot in processes: print(fmt(pid=snapshot.pid, username=snapshot.username[:7] + ('+' if len(snapshot.username) > 8 else snapshot.username[7:8]), cpu=snapshot.cpu_percent, host_memory=snapshot.host_memory_human, time=snapshot.running_time_human, gpu_memory=(snapshot.gpu_memory_human if snapshot.gpu_memory_human is not NA else 'WDDM:N/A'), sm=snapshot.gpu_sm_utilization, command=snapshot.command)) else: print(colored(' - No Running Processes', attrs=('bold',))) if 
separator: print('-' * 120) separator = True .. figure:: https://user-images.githubusercontent.com/16078332/177041142-fe988d58-6a97-4559-84fd-b51204cf9231.png :align: center An example monitoring script built with APIs from ``nvitop``. Please refer to section `More than a Monitor `_ in README for more examples. ------ .. toctree:: :maxdepth: 4 :caption: API Reference api/device api/process api/host api/collector api/libnvml api/libcuda api/libcudart api/caching api/utils select callbacks ------ Module Contents """"""""""""""" .. automodule:: nvitop.version :members: :undoc-members: :show-inheritance: :member-order: bysource .. autoclass:: nvitop.NaType :members: :undoc-members: :show-inheritance: :member-order: bysource .. autodata:: nvitop.NA .. autoclass:: nvitop.NotApplicableType .. autodata:: nvitop.NotApplicable .. automodule:: nvitop :members: :undoc-members: :show-inheritance: :member-order: bysource :noindex: Device :exclude-members: NA, NaType, NotApplicable, NotApplicableType ------ Indices and Tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` nvitop-1.4.2/docs/source/select.rst000066400000000000000000000002021474547113600173040ustar00rootroot00000000000000nvitop.select module -------------------- .. 
automodule:: nvitop.select :members: :undoc-members: :show-inheritance: nvitop-1.4.2/docs/source/spelling_wordlist.txt000066400000000000000000000021151474547113600216050ustar00rootroot00000000000000nvitop nvisel Xuehai Pan panxuehai pyproject toml txt GPL fmt Redhat WSL enum isinstance str bool boolean ints args kwargs py os sys nan func divmod ord cwd environ lstrip rstrip const globals Iterable iterable psutil cmdline oneshot ctx IPython libcuda libcudart libnvml nvidia nvidia-smi smi pynvml PID pid GPU gpu GPUs GPU's gpus MIG mig runtime RTX TTC TCC WDM WDDM ECC ecc SM sm shader PCI pci pstate xxxxxx milliwatts linters docstring Cuda cuda CUDA CUDAError NVML NVMLError CudaDevice PhysicalDevice MigDevice GpuProcess DeviceSnapshot GpuProcessSnapshot ResourceMetricCollector UUID uuid rw ppid uid uids gids RSS NoSuchProcess AccessDenied fallbacks memoize csv noheader nounits xargs superset subprocess MiB GiB Failsafe failsafe unicode Traceback Subclasses Displayables WideString tensorflow Keras keras Xception fg bg attr esc Ctrl ascii colorscheme addstr getch bstate getmouse uncase lol xx yyy zz CLI submodule submodules namespace noqa uptime ot oT mypy struct MPS KMD conf Unallocated KiB tx rx ThroughputInfo pytorch api utils GpuStatsLogger hostname len maxsize reentrant env tty nvitop-1.4.2/install-nvidia-driver.sh000077500000000000000000000362661474547113600176340ustar00rootroot00000000000000#!/bin/bash # ============================================================================== # # Usage: bash install-nvidia-driver.sh [--package=PKG] [--upgrade-only] [--latest] [--dry-run] [--yes] [--help] # # Examples: # # bash install-nvidia-driver.sh # bash install-nvidia-driver.sh --package=nvidia-driver-470 # bash install-nvidia-driver.sh --upgrade-only # bash install-nvidia-driver.sh --latest # # ============================================================================== # This file is part of nvitop, the interactive NVIDIA-GPU process viewer. 
# # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== # shellcheck disable=SC2016,SC2312 set -u set -e shopt -s inherit_errexit &>/dev/null || true function abort() { echo "$@" >&2 exit 1 } # shellcheck disable=2292 if [ -z "${BASH_VERSION:-}" ]; then abort "Bash is required to interpret this script." fi export LANGUAGE="C:en" # shellcheck disable=1091 if [[ "$(uname -s)" != "Linux" ]] || (source /etc/os-release && [[ "${NAME:-}" != "Ubuntu" ]]); then abort "This script only supports Ubuntu Linux." elif [[ -n "${WSL_DISTRO_NAME:-}" ]] || (uname -r | grep -qiF 'microsoft'); then abort "Please install the NVIDIA driver in the Windows host system rather than in WSL." 
fi # String formatters if [[ -t 1 ]]; then tty_escape() { printf "\033[%sm" "$1"; } else tty_escape() { :; } fi tty_mkbold() { tty_escape "1;$1"; } tty_blue="$(tty_mkbold 34)" tty_red="$(tty_mkbold 31)" tty_green="$(tty_mkbold 32)" tty_yellow="$(tty_mkbold 33)" tty_white="$(tty_mkbold 37)" tty_bold="$(tty_mkbold 39)" tty_reset="$(tty_escape 0)" function usage() { cat <&2 echo >&2 abort "Invalid option '${arg}'" ;; esac done ### Functions ###################################################################################### function apt-list-packages() { dpkg-query --show --showformat='${Installed-Size} ${binary:Package} ${Version} ${Status}\n' | grep -v deinstall | sort -n | awk '{ print $1" "$2" "$3 }' } function apt-list-nvidia-packages() { local packages packages="$( apt-list-packages | awk '$2 ~ /nvidia.*-([0-9]+)(:.*)?$/ { print $2 }' | sort )" echo "${packages//$'\n'/ }" } function apt-installed-version() { apt-cache policy "$1" | grep -F 'Installed' | awk '{ print $2 }' } function apt-candidate-version() { apt-cache policy "$1" | grep -F 'Candidate' | awk '{ print $2 }' } # From https://github.com/XuehaiPan/Dev-Setup.git function exec_cmd() { printf "%s" "$@" | awk \ 'BEGIN { RESET = "\033[0m"; BOLD = "\033[1m"; UNDERLINE = "\033[4m"; UNDERLINEOFF = "\033[24m"; RED = "\033[31m"; GREEN = "\033[32m"; YELLOW = "\033[33m"; WHITE = "\033[37m"; GRAY = "\033[90m"; IDENTIFIER = "[_a-zA-Z][_a-zA-Z0-9]*"; idx = 0; in_string = 0; double_quoted = 1; printf("%s$", BOLD WHITE); } { for (i = 1; i <= NF; ++i) { style = WHITE; post_style = WHITE; if (!in_string) { if ($i ~ /^-/) style = YELLOW; else if ($i == "sudo" && idx == 0) { style = UNDERLINE GREEN; post_style = UNDERLINEOFF WHITE; } else if ($i ~ "^" IDENTIFIER "=" && idx == 0) { style = GRAY; '"if (\$i ~ \"^\" IDENTIFIER \"=[\\\"']\") {"' in_string = 1; double_quoted = ($i ~ "^" IDENTIFIER "=\""); } } else if ($i ~ /^[12&]?>>?/ || $i == "\\") style = RED; else { ++idx; '"if (\$i ~ /^[\"']/) {"' in_string = 1; 
double_quoted = ($i ~ /^"/); } if (idx == 1) style = GREEN; } } if (in_string) { if (style == WHITE) style = ""; post_style = ""; '"if ((double_quoted && \$i ~ /\";?\$/ && \$i !~ /\\\\\";?\$/) || (!double_quoted && \$i ~ /';?\$/))"' in_string = 0; } if (($i ~ /;$/ && $i !~ /\\;$/) || $i == "|" || $i == "||" || $i == "&&") { if (!in_string) { idx = 0; if ($i !~ /;$/) style = RED; } } if ($i ~ /;$/ && $i !~ /\\;$/) printf(" %s%s%s;%s", style, substr($i, 1, length($i) - 1), (in_string ? WHITE : RED), post_style); else printf(" %s%s%s", style, $i, post_style); if ($i == "\\") printf("\n\t"); } } END { printf("%s\n", RESET); }' >&2 # shellcheck disable=SC2294 eval "$@" } # From https://github.com/Homebrew/install.git shell_join() { local arg printf "%s" "$1" shift for arg in "$@"; do printf " " printf "%s" "${arg// /\ }" done } chomp() { printf "%s" "${1/"$'\n'"/}" } ohai() { printf "${tty_blue}==>${tty_bold} %s${tty_reset}\n" "$(shell_join "$@")" } warn() { printf "${tty_red}Warning:${tty_reset} %s\n" "$(chomp "$1")" } ring_bell() { # Use the shell's audible bell. if [[ -t 1 ]]; then printf "\a" fi } getc() { local save_state save_state="$(/bin/stty -g)" /bin/stty raw -echo IFS='' read -r -n 1 -d '' "$@" /bin/stty "${save_state}" } CONFIRM_MESSAGE="Press ${tty_bold}RETURN${tty_reset}/${tty_bold}ENTER${tty_reset} to continue or any other key to abort:" wait_for_user() { local c echo echo "${CONFIRM_MESSAGE}" getc c # we test for \r and \n because some stuff does \r instead if ! [[ "${c}" == $'\r' || "${c}" == $'\n' ]]; then exit 1 fi } function have_sudo_access() { if [[ "${EUID:-"${UID}"}" == "0" ]]; then return 0 fi if [[ ! -x "/usr/bin/sudo" ]]; then return 1 fi local -a SUDO=("/usr/bin/sudo") if [[ -n "${SUDO_ASKPASS-}" ]]; then SUDO+=("-A") fi if [[ -z "${HAVE_SUDO_ACCESS-}" ]]; then ohai "Checking sudo access (press ${tty_yellow}Ctrl+C${tty_white} to list the driver versions only)." 
>&2 exec_cmd "${SUDO[*]} -v && ${SUDO[*]} -l mkdir &>/dev/null" HAVE_SUDO_ACCESS="$?" fi return "${HAVE_SUDO_ACCESS}" } ### Add Graphics Drivers PPA ####################################################################### # shellcheck disable=SC2310 if have_sudo_access; then exec_cmd 'sudo apt-get update' if ! (apt-cache policy | grep -qF 'graphics-drivers/ppa/ubuntu'); then exec_cmd 'sudo apt-get install software-properties-common apt-transport-https --yes' exec_cmd 'sudo add-apt-repository ppa:graphics-drivers/ppa --yes' exec_cmd 'sudo apt-get update' fi echo else DRY_RUN=1 fi ### Query and list available driver packages ####################################################### # shellcheck disable=SC2207 AVAILABLE_DRIVERS=($( apt-cache search --names-only nvidia-driver | awk '$1 ~ /^nvidia-driver-([0-9]+)$/ { print $1 }' | sort -V )) if [[ "${#AVAILABLE_DRIVERS[@]}" -eq 0 ]]; then abort "No available drivers found from APT." fi LATEST_DRIVER="${AVAILABLE_DRIVERS[-1]}" LATEST_DRIVER_VERSION="$(apt-candidate-version "${LATEST_DRIVER}")" INSTALLED_DRIVER="$(apt-list-packages | awk '$2 ~ /nvidia-driver-([0-9]+)$/ { print $2 }')" if [[ -n "${INSTALLED_DRIVER}" ]]; then INSTALLED_DRIVER_VERSION="$(apt-installed-version "${INSTALLED_DRIVER}")" INSTALLED_DRIVER_CANDIDATE_VERSION="$(apt-candidate-version "${INSTALLED_DRIVER}")" else INSTALLED_DRIVER_VERSION='' INSTALLED_DRIVER_CANDIDATE_VERSION='' fi if [[ -z "${REQUESTED_DRIVER}" ]]; then if [[ -n "${LATEST}" || -z "${INSTALLED_DRIVER}" ]]; then REQUESTED_DRIVER="${LATEST_DRIVER}" REQUESTED_DRIVER_VERSION="${LATEST_DRIVER_VERSION}" else REQUESTED_DRIVER="${INSTALLED_DRIVER}" REQUESTED_DRIVER_VERSION="${INSTALLED_DRIVER_CANDIDATE_VERSION}" fi else REQUESTED_DRIVER_VERSION="$(apt-candidate-version "${REQUESTED_DRIVER}")" if [[ -z "${REQUESTED_DRIVER_VERSION}" ]]; then abort "Unable to locate package ${REQUESTED_DRIVER}." 
fi fi ohai "Available NVIDIA drivers:" for driver in "${AVAILABLE_DRIVERS[@]}"; do prefix=" " if [[ "${driver}" == "${REQUESTED_DRIVER}" ]]; then prefix="--> " fi if [[ "${driver}" == "${INSTALLED_DRIVER}" ]]; then if [[ "${driver}" != "${REQUESTED_DRIVER}" ]]; then prefix="<-- " elif [[ "${REQUESTED_DRIVER_VERSION}" != "${INSTALLED_DRIVER_VERSION}" ]]; then prefix="--> " else prefix="--- " fi if [[ "${INSTALLED_DRIVER_VERSION}" == "${INSTALLED_DRIVER_CANDIDATE_VERSION}" ]]; then if [[ "${driver}" == "${LATEST_DRIVER}" ]]; then echo "${prefix}${tty_green}${driver} [${INSTALLED_DRIVER_VERSION}]${tty_reset} ${tty_yellow}[installed]${tty_reset} (up-to-date)" else echo "${prefix}${tty_bold}${driver} [${INSTALLED_DRIVER_VERSION}]${tty_reset} ${tty_yellow}[installed]${tty_reset} (up-to-date)" fi else echo "${prefix}${tty_bold}${driver} [${INSTALLED_DRIVER_VERSION}]${tty_reset} ${tty_yellow}[installed]${tty_reset} (upgradable to [${INSTALLED_DRIVER_CANDIDATE_VERSION}])" fi elif [[ "${driver}" == "${LATEST_DRIVER}" ]]; then echo "${prefix}${tty_green}${driver} [${LATEST_DRIVER_VERSION}]${tty_reset} (latest)" else echo "${prefix}${driver} [$(apt-candidate-version "${driver}")]" fi done if [[ "${INSTALLED_DRIVER}@${INSTALLED_DRIVER_VERSION}" == "${REQUESTED_DRIVER}@${REQUESTED_DRIVER_VERSION}" ]]; then echo ohai "Your NVIDIA driver is already up-to-date." exit elif [[ "${INSTALLED_DRIVER}" == "${REQUESTED_DRIVER}" && -z "${UPGRADE_ONLY}" && -z "${LATEST}" ]]; then echo ohai "The requested driver ${REQUESTED_DRIVER} is already installed. Run \`bash $(basename "$0") --upgrade-only\` to upgrade." exit elif [[ -n "${DRY_RUN}" ]]; then exit fi echo ### Show the installation plan and wait for user confirmation ###################################### if [[ -z "${INSTALLED_DRIVER}" ]]; then ohai "Install the NVIDIA driver ${REQUESTED_DRIVER} [${REQUESTED_DRIVER_VERSION}]." 
elif [[ "${REQUESTED_DRIVER#nvidia-driver-}" -ge "${INSTALLED_DRIVER#nvidia-driver-}" ]]; then ohai "Upgrade the NVIDIA driver from ${INSTALLED_DRIVER} [${INSTALLED_DRIVER_VERSION}] to ${REQUESTED_DRIVER} [${REQUESTED_DRIVER_VERSION}]." else ohai "Downgrade the NVIDIA driver from ${INSTALLED_DRIVER} [${INSTALLED_DRIVER_VERSION}] to ${REQUESTED_DRIVER} [${REQUESTED_DRIVER_VERSION}]." fi DM_SERVICES=() if [[ -n "$(sudo lsof -t /dev/nvidia* 2>/dev/null || true)" ]]; then for dm in gdm3 lightdm; do if service "${dm}" status &>/dev/null; then DM_SERVICES+=("${dm}") fi done fi if [[ "${#DM_SERVICES[@]}" -gt 0 ]]; then if [[ "${#DM_SERVICES[@]}" -gt 1 ]]; then warn "The following display manager services are running:" else warn "The following display manager service is running:" fi printf " - %s\n" "${DM_SERVICES[@]}" echo "The service will be stopped during the installation which may shut down the GUI desktop. The service will be restarted after the installation." fi if [[ -z "${YES}" ]]; then ring_bell CONFIRM_MESSAGE="Press ${tty_bold}RETURN${tty_reset}/${tty_bold}ENTER${tty_reset} to continue or any other key to abort:" wait_for_user fi ### Do install/upgrade the requested driver package ################################################ # Stop display manager services for dm in "${DM_SERVICES[@]}"; do exec_cmd "sudo service ${dm} stop" # shellcheck disable=SC2064 trap "exec_cmd 'sudo service ${dm} start'" EXIT # restart the service on exit done # Disable persistence mode if [[ -n "$(sudo lsmod | grep '^nvidia' | awk '{ print $1 }')" ]]; then exec_cmd "sudo nvidia-smi -pm 0 || true" fi sleep 1 # ensure the processes are stopped # Ensure no processes are using the NVIDIA devices # shellcheck disable=SC2207 PIDS=($(sudo lsof -t /dev/nvidia* 2>/dev/null || true)) if [[ "${#PIDS[@]}" -gt 0 ]]; then cat >&2 < Grafana Dashboard
The Grafana dashboard for the exporter.

nvitop-1.4.2/nvitop-exporter/dashboard.json000066400000000000000000002121161474547113600210630ustar00rootroot00000000000000{ "__inputs": [ { "name": "DS_PROMETHEUS", "label": "prometheus", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__elements": {}, "__requires": [ { "type": "panel", "id": "gauge", "name": "Gauge", "version": "" }, { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "11.3.1" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "table", "name": "Table", "version": "" }, { "type": "panel", "id": "timeseries", "name": "Time series", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "grafana", "uid": "-- Grafana --" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "description": "Grafana Dashboard built by `nvitop-exporter`.", "editable": false, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": null, "links": [], "liveNow": true, "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 1, "panels": [], "title": "Overview", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 100, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percent" } }, "gridPos": { "h": 9, "w": 4, "x": 0, "y": 1 }, "id": 2, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": 
false, "editorMode": "builder", "exemplar": false, "expr": "host_cpu_percent_Percentage{hostname=~\"$hostname\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} CPU", "range": true, "refId": "A", "useBackend": false } ], "title": "CPU Utilization", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 100, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percent" } }, "gridPos": { "h": 18, "w": 10, "x": 4, "y": 1 }, "id": 3, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "gpu_utilization_Percentage{hostname=~\"$hostname\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} GPU {{index}}", "range": true, "refId": "A", "useBackend": false } ], "title": "GPU Utilization", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 100, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percent" } }, "gridPos": { "h": 18, "w": 10, "x": 14, "y": 1 }, "id": 4, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, 
"showThresholdMarkers": true, "sizing": "auto" }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "gpu_memory_percent_Percentage{hostname=~\"$hostname\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} GPU {{index}}", "range": true, "refId": "A", "useBackend": false } ], "title": "GPU Memory", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 100, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percent" } }, "gridPos": { "h": 9, "w": 4, "x": 0, "y": 10 }, "id": 5, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "host_virtual_memory_percent_Percentage{hostname=~\"$hostname\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} MEM", "range": true, "refId": "A", "useBackend": false } ], "title": "Host Virtual Memory", "type": "gauge" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 19 }, "id": 6, "panels": [], "title": "Process", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 0, "displayName": "occupied", "min": 0, "noValue": "0", "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": 
"yellow", "value": 70 }, { "color": "orange", "value": 80 }, { "color": "red", "value": 90 }, { "color": "dark-red", "value": 95 } ] }, "unit": "none" }, "overrides": [ { "matcher": { "id": "byName", "options": "Value #number of gpus" }, "properties": [ { "id": "displayName", "value": "total" }, { "id": "color", "value": { "fixedColor": "gray", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 4, "w": 5, "x": 0, "y": 20 }, "id": 7, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "value_and_name", "wideLayout": true }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "count(group by(uuid) (process_gpu_memory_MiB{hostname=~\"$hostname\", username=~\"$username\"}))", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "__auto", "range": false, "refId": "number of occupied gpus", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "count(gpu_memory_total_MiB{hostname=~\"$hostname\"})", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "__auto", "range": false, "refId": "number of gpus", "useBackend": false } ], "title": "Number of GPUs", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 0, "fieldMinMax": false, "min": 0, "noValue": "No Running Processes", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 4 }, { "color": 
"orange", "value": 6 }, { "color": "red", "value": 8 } ] }, "unit": "none" } }, "gridPos": { "h": 8, "w": 19, "x": 5, "y": 20 }, "id": 8, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "value_and_name", "wideLayout": true }, "targets": [ { "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "count by(hostname, index) (process_gpu_memory_MiB{hostname=~\"$hostname\", username=~\"$username\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": true, "legendFormat": "{{hostname}} GPU {{index}}", "range": false, "refId": "number of gpu processes", "useBackend": false } ], "title": "Number of Running GPU Processes", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 0, "min": 0, "noValue": "No Running Processes", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 8 }, { "color": "orange", "value": 16 }, { "color": "red", "value": 32 } ] }, "unit": "none" } }, "gridPos": { "h": 4, "w": 5, "x": 0, "y": 24 }, "id": 9, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "value_and_name", "wideLayout": true }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "count by(hostname) (count by(hostname, pid) (process_info_info{hostname=~\"$hostname\", username=~\"$username\"}))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, 
"instant": true, "legendFormat": "__auto", "range": false, "refId": "number of gpu processes", "useBackend": false } ], "title": "Number of Running GPU Processes", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "custom": { "align": "right", "cellOptions": { "type": "auto", "wrapText": false }, "filterable": true, "inspect": false, "minWidth": 50 }, "fieldMinMax": true, "min": 0, "noValue": "No Running Processes", "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] } }, "overrides": [ { "matcher": { "id": "byRegexp", "options": "/^%.*$/" }, "properties": [ { "id": "custom.cellOptions", "value": { "mode": "lcd", "type": "gauge", "valueDisplayMode": "color" } }, { "id": "max", "value": 100 }, { "id": "unit", "value": "percent" } ] }, { "matcher": { "id": "byName", "options": "RSS MEMORY" }, "properties": [ { "id": "unit", "value": "mbytes" }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge", "valueDisplayMode": "color" } }, { "id": "custom.minWidth", "value": 150 } ] }, { "matcher": { "id": "byName", "options": "GPU MEMORY" }, "properties": [ { "id": "unit", "value": "mbytes" }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge", "valueDisplayMode": "color" } }, { "id": "custom.minWidth", "value": 150 } ] }, { "matcher": { "id": "byName", "options": "TIME" }, "properties": [ { "id": "unit", "value": "s" } ] }, { "matcher": { "id": "byName", "options": "COMMAND" }, "properties": [ { "id": "custom.align", "value": "left" } ] } ] }, "gridPos": { "h": 13, "w": 24, "x": 0, "y": 28 }, "id": 10, "options": { "cellHeight": "sm", "footer": { "countRows": false, "enablePagination": false, "fields": [ "Value #gpu memory (lastNotNull)" ], "reducer": [ "sum" ], "show": true }, "frameIndex": 0, "showHeader": true, "sortBy": [ { "desc": true, "displayName": 
"GPU MEMORY" } ] }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "process_cpu_percent_Percentage{hostname=~\"$hostname\", username=~\"$username\"}", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "interval": "", "legendFormat": "__auto", "range": false, "refId": "cpu percent", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "process_rss_memory_MiB{hostname=~\"$hostname\", username=~\"$username\"}", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "__auto", "range": false, "refId": "rss memory", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "process_gpu_memory_MiB{hostname=~\"$hostname\", username=~\"$username\"}", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "__auto", "range": false, "refId": "gpu memory", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "process_gpu_sm_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "__auto", "range": false, "refId": "gpu sm utilization", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "process_gpu_memory_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}", "format": 
"table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "__auto", "range": false, "refId": "gpu memory bandwidth utilization", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "process_gpu_encoder_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "__auto", "range": false, "refId": "gpu encoder utilization", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "process_gpu_decoder_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "__auto", "range": false, "refId": "gpu decoder utilization", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "process_running_time_Second{hostname=~\"$hostname\", username=~\"$username\"}", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "interval": "", "legendFormat": "__auto", "range": false, "refId": "running time", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "process_info_info{hostname=~\"$hostname\", username=~\"$username\"}", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "__auto", "range": false, "refId": "command", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": 
"${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "host_virtual_memory_total_MiB{hostname=~\"$hostname\"}", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "__auto", "range": false, "refId": "host memory total", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "gpu_memory_total_MiB{hostname!~\"$hostname\"}", "format": "table", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "__auto", "range": false, "refId": "gpu memory total", "useBackend": false } ], "title": "GPU Processes", "transformations": [ { "id": "configFromData", "options": { "applyTo": { "id": "byType", "options": "number" }, "configRefId": "host memory total", "mappings": [ { "fieldName": "Value #rss memory", "handlerKey": "max" } ] } }, { "id": "configFromData", "options": { "applyTo": { "id": "byType", "options": "number" }, "configRefId": "gpu memory total", "mappings": [ { "fieldName": "Value #gpu memory", "handlerKey": "max" } ] } }, { "id": "merge", "options": {} }, { "id": "groupBy", "options": { "fields": { "Row": { "aggregations": [], "operation": "aggregate" }, "Value #command": { "aggregations": [] }, "Value #cpu percent": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "Value #gpu decoder utilization": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "Value #gpu encoder utilization": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "Value #gpu memory": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "Value #gpu memory bandwidth utilization": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "Value #gpu sm utilization": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "Value #rss memory": { 
"aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "Value #running time": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "command": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "hostname": { "aggregations": [], "operation": "groupby" }, "index": { "aggregations": [], "operation": "groupby" }, "pid": { "aggregations": [], "operation": "groupby" }, "status": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "username": { "aggregations": [], "operation": "groupby" }, "uuid": { "aggregations": [], "operation": "groupby" } } } }, { "id": "organize", "options": { "excludeByName": { "uuid": true }, "includeByName": {}, "indexByName": { "Value #cpu percent (lastNotNull)": 6, "Value #gpu decoder utilization (lastNotNull)": 12, "Value #gpu encoder utilization (lastNotNull)": 11, "Value #gpu memory (lastNotNull)": 8, "Value #gpu memory bandwidth utilization (lastNotNull)": 10, "Value #gpu sm utilization (lastNotNull)": 9, "Value #rss memory (lastNotNull)": 7, "Value #running time (lastNotNull)": 13, "command (lastNotNull)": 14, "hostname": 0, "index": 3, "pid": 1, "status (lastNotNull)": 5, "username": 2, "uuid": 4 }, "renameByName": { "Value #cpu percent (lastNotNull)": "%CPU", "Value #gpu decoder utilization (lastNotNull)": "%DEC", "Value #gpu encoder utilization (lastNotNull)": "%ENC", "Value #gpu memory (lastNotNull)": "GPU MEMORY", "Value #gpu memory bandwidth utilization (lastNotNull)": "%GMBW", "Value #gpu sm utilization (lastNotNull)": "%SM", "Value #rss memory (lastNotNull)": "RSS MEMORY", "Value #running time (lastNotNull)": "TIME", "command (lastNotNull)": "COMMAND", "hostname": "HOSTNAME", "index": "DEVICE", "pid": "PID", "status (lastNotNull)": "STATUS", "username": "USERNAME" } } } ], "type": "table" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 41 }, "id": 11, "panels": [], "title": "System", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" 
}, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 10, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 2, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "min": 0, "unit": "percent" } }, "gridPos": { "h": 14, "w": 12, "x": 0, "y": 42 }, "id": 12, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "host_cpu_percent_Percentage{hostname=~\"$hostname\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}}", "range": true, "refId": "A", "useBackend": false } ], "title": "CPU Utilization", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 10, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 2, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, 
"showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "unit": "percent" } }, "gridPos": { "h": 14, "w": 12, "x": 12, "y": 42 }, "id": 13, "options": { "legend": { "calcs": [ "lastNotNull" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "host_load_average_1m_Percentage{hostname=~\"$hostname\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} (1m)", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "host_load_average_5m_Percentage{hostname=~\"$hostname\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} (5m)", "range": true, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "host_load_average_15m_Percentage{hostname=~\"$hostname\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} (15m)", "range": true, "refId": "C", "useBackend": false } ], "title": "CPU Load Average", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 1024, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 2, "gradientMode": "opacity", "hideFrom": 
{ "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "unit": "mbytes" } }, "gridPos": { "h": 14, "w": 12, "x": 0, "y": 56 }, "id": 14, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "host_virtual_memory_used_MiB{hostname=~\"$hostname\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}}", "range": true, "refId": "A", "useBackend": false } ], "title": "Host Virtual Memory", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 1024, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 2, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "min": 0, "unit": "mbytes" } }, "gridPos": { "h": 14, "w": 12, "x": 12, "y": 56 }, "id": 15, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, 
"sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "host_swap_memory_used_MiB{hostname=~\"$hostname\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}}", "range": true, "refId": "A", "useBackend": false } ], "title": "Host Swap Memory", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 70 }, "id": 16, "panels": [], "title": "System I/O", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 1, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 2, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "min": 0, "unit": "MiBs" } }, "gridPos": { "h": 14, "w": 12, "x": 0, "y": 71 }, "id": 17, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum by(hostname) (rate(host_net_io_rx_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": 
true, "instant": false, "legendFormat": "{{hostname}} RX", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum by(hostname) (rate(host_net_io_tx_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} TX", "range": true, "refId": "B", "useBackend": false } ], "title": "Host Network I/O", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 1, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 2, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "unit": "MiBs" } }, "gridPos": { "h": 14, "w": 12, "x": 12, "y": 71 }, "id": 18, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum by(hostname) (rate(host_disk_io_read_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} Read", "range": true, "refId": "A", 
"useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum by(hostname) (rate(host_disk_io_write_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} Write", "range": true, "refId": "B", "useBackend": false } ], "title": "Host Disk I/O", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 85 }, "id": 19, "panels": [], "title": "Device", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "max": 100, "min": 0, "unit": "percent" } }, "gridPos": { "h": 14, "w": 12, "x": 0, "y": 86 }, "id": 20, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "gpu_utilization_Percentage{hostname=~\"$hostname\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} GPU {{index}}", "range": true, 
"refId": "A", "useBackend": false } ], "title": "GPU Utilization", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 1024, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "min": 0, "unit": "mbytes" } }, "gridPos": { "h": 14, "w": 12, "x": 12, "y": 86 }, "id": 21, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "gpu_memory_used_MiB{hostname=~\"$hostname\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} GPU {{index}}", "range": true, "refId": "A", "useBackend": false } ], "title": "GPU Memory", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 1, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, 
"tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "min": 0, "unit": "MiBs" } }, "gridPos": { "h": 14, "w": 12, "x": 0, "y": 100 }, "id": 22, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "gpu_pcie_rx_throughput_MiBps{hostname=~\"$hostname\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} GPU {{index}}", "range": true, "refId": "A", "useBackend": false } ], "title": "GPU PCIe RX Throughput", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 1, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "min": 0, "unit": "MiBs" } }, "gridPos": { "h": 14, "w": 12, "x": 12, "y": 100 }, "id": 23, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", 
"showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "gpu_pcie_rx_throughput_MiBps{hostname=~\"$hostname\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} GPU {{index}}", "range": true, "refId": "A", "useBackend": false } ], "title": "GPU PCIe TX Throughput", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 1, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "min": 0, "unit": "MiBs" } }, "gridPos": { "h": 14, "w": 12, "x": 0, "y": 114 }, "id": 24, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "gpu_nvlink_total_rx_throughput_MiBps{hostname=~\"$hostname\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} GPU {{index}}", "range": true, "refId": "A", 
"useBackend": false } ], "title": "GPU NVLink RX Throughput", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 1, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "min": 0, "unit": "MiBs" } }, "gridPos": { "h": 14, "w": 12, "x": 12, "y": 114 }, "id": 25, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "gpu_nvlink_total_tx_throughput_MiBps{hostname=~\"$hostname\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} GPU {{index}}", "range": true, "refId": "A", "useBackend": false } ], "title": "GPU NVLink TX Throughput", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, 
"tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "unit": "watt" } }, "gridPos": { "h": 14, "w": 12, "x": 0, "y": 128 }, "id": 26, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, "tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "gpu_power_usage_W{hostname=~\"$hostname\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} GPU {{index}}", "range": true, "refId": "A", "useBackend": false } ], "title": "GPU Power Usage", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "unit": "celsius" } }, "gridPos": { "h": 14, "w": 12, "x": 12, "y": 128 }, "id": 27, "options": { "legend": { "calcs": [ "lastNotNull", "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Name", "sortDesc": false }, 
"tooltip": { "maxHeight": 600, "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "gpu_temperature_C{hostname=~\"$hostname\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{hostname}} GPU {{index}}", "range": true, "refId": "A", "useBackend": false } ], "title": "GPU Temperature", "type": "timeseries" } ], "preload": true, "refresh": "5s", "schemaVersion": 40, "tags": [ "nvitop", "nvitop-exporter", "prometheus", "nvidia", "gpu", "gpu process", "gpu monitoring" ], "templating": { "list": [ { "current": {}, "definition": "label_values(hostname)", "description": "", "includeAll": true, "multi": true, "name": "hostname", "options": [], "query": { "qryType": 1, "query": "label_values(hostname)" }, "refresh": 1, "regex": "", "type": "query" }, { "current": {}, "definition": "label_values(username)", "description": "", "includeAll": true, "multi": true, "name": "username", "options": [], "query": { "qryType": 1, "query": "label_values(username)" }, "refresh": 1, "regex": "", "type": "query" } ] }, "time": { "from": "now-6h", "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "nvitop-dashboard", "uid": "bdl3vqwxprhtsa", "version": 1, "weekStart": "" } nvitop-1.4.2/nvitop-exporter/nvitop_exporter/000077500000000000000000000000001474547113600215055ustar00rootroot00000000000000nvitop-1.4.2/nvitop-exporter/nvitop_exporter/__init__.py000066400000000000000000000017731474547113600236260ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Prometheus exporter built on top of ``nvitop``.""" from nvitop_exporter.exporter import PrometheusExporter from nvitop_exporter.utils import get_ip_address from nvitop_exporter.version import __version__ __all__ = ['PrometheusExporter', 'get_ip_address'] nvitop-1.4.2/nvitop-exporter/nvitop_exporter/__main__.py000066400000000000000000000016201474547113600235760ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Prometheus exporter built on top of ``nvitop``.""" import sys from nvitop_exporter.cli import main if __name__ == '__main__': sys.exit(main()) nvitop-1.4.2/nvitop-exporter/nvitop_exporter/cli.py000066400000000000000000000164221474547113600226330ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. 
def cprint(text: str = '', *, file: TextIO | None = None) -> None:
    """Write ``text`` to ``file``, highlighting a known log-level prefix in color.

    If ``text`` starts with one of the recognized prefixes (``INFO:``,
    ``WARNING:``, ``ERROR:``, ``NVML ERROR:``), the prefix tag is rendered in
    bold color; otherwise the text is printed unchanged.
    """
    # Mapping of recognized message prefixes to their highlight colors.
    prefix_colors = {
        'INFO: ': 'yellow',
        'WARNING: ': 'yellow',
        'ERROR: ': 'red',
        'NVML ERROR: ': 'red',
    }
    for prefix, color in prefix_colors.items():
        if text.startswith(prefix):
            tag = prefix.rstrip()
            highlighted = colored(tag, color=color, attrs=('bold',))  # type: ignore[arg-type]
            text = text.replace(tag, highlighted, 1)
    print(text, file=file)
def parse_arguments() -> argparse.Namespace:
    """Build the ``nvitop-exporter`` argument parser and parse the command line.

    Returns:
        The parsed command-line arguments.

    Raises:
        SystemExit: Via ``parser.error`` when the update interval is below 0.25s.
    """

    def positive_float(text: str) -> float:
        # Argument type accepting only strictly positive floating-point values.
        value = float(text)
        if value <= 0:
            raise ValueError
        return value

    # argparse uses the function's __name__ in its invalid-value error messages.
    positive_float.__name__ = 'positive float'

    parser = argparse.ArgumentParser(
        prog='nvitop-exporter',
        description='Prometheus exporter built on top of `nvitop`.',
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False,  # a customized `--help` flag is registered below
    )
    parser.add_argument(
        '--help',
        '-h',
        action='help',
        dest='help',
        default=argparse.SUPPRESS,
        help='Show this help message and exit.',
    )
    parser.add_argument(
        '--version',
        '-V',
        action='version',
        dest='version',
        version=f'%(prog)s {__version__} (nvitop {nvitop.__version__})',
        help="Show %(prog)s's version number and exit.",
    )
    parser.add_argument(
        '--hostname',
        '--host',
        '-H',
        dest='hostname',
        type=str,
        metavar='HOSTNAME',
        default=get_ip_address(),
        help='Hostname to display in the exporter. (default: %(default)s)',
    )
    parser.add_argument(
        '--bind-address',
        '--bind',
        '-B',
        dest='bind_address',
        type=str,
        metavar='ADDRESS',
        default='127.0.0.1',
        help='Local address to bind to. (default: %(default)s)',
    )
    parser.add_argument(
        '--port',
        '-p',
        type=int,
        default=8000,
        help='Port to listen on. (default: %(default)d)',
    )
    parser.add_argument(
        '--interval',
        dest='interval',
        type=positive_float,
        metavar='SEC',
        default=1.0,
        help='Interval between updates in seconds. (default: %(default)s)',
    )

    args = parser.parse_args()

    # Reject very short polling intervals even though they are positive numbers.
    if args.interval < 0.25:
        parser.error(
            f'the interval {args.interval:0.2g}s is too short, which may cause performance issues. '
            f'Expected 1/4 or higher.',
        )

    return args
def main() -> int:  # pylint: disable=too-many-locals,too-many-statements
    """Main function for ``nvitop-exporter`` CLI.

    Enumerates the available NVIDIA devices (physical GPUs and MIG instances),
    starts a WSGI server exposing the Prometheus ``/metrics`` endpoint, and then
    collects metrics in a loop until interrupted.

    Returns:
        The process exit code: ``0`` on success, ``1`` when NVML is unavailable,
        no NVIDIA devices are found, or the server cannot be started.
    """
    args = parse_arguments()

    # Probe NVML first so a missing or broken driver fails fast with a clear message.
    try:
        device_count = Device.count()
    except libnvml.NVMLError_LibraryNotFound:
        return 1
    except libnvml.NVMLError as ex:
        cprint(f'NVML ERROR: {ex}', file=sys.stderr)
        return 1
    if device_count == 0:
        cprint('NVML ERROR: No NVIDIA devices found.', file=sys.stderr)
        return 1

    # Gather both the physical GPUs and any MIG instances they expose.
    physical_devices = Device.from_indices(range(device_count))
    mig_devices = []
    for device in physical_devices:
        mig_devices.extend(device.mig_devices())

    cprint(
        'INFO: Found {}{}.'.format(
            colored(str(device_count), color='green', attrs=('bold',)),
            (
                ' physical device(s) and {} MIG device(s)'.format(
                    colored(str(len(mig_devices)), color='blue', attrs=('bold',)),
                )
                if mig_devices
                else ' device(s)'
            ),
        ),
        file=sys.stderr,
    )

    # Sort by index. MIG devices expose a tuple index, so wrap plain integer
    # indices into 1-tuples to keep the sort keys mutually comparable.
    devices = sorted(
        physical_devices + mig_devices,  # type: ignore[operator]
        key=lambda d: (d.index,) if isinstance(d.index, int) else d.index,
    )
    for device in devices:
        name = device.name()
        uuid = device.uuid()
        if device.is_mig_device():
            name = name.rpartition(' ')[-1]  # keep only the MIG profile suffix of the name
            cprint(
                f'INFO: MIG {name:<11} Device {device.mig_index:>2d}: (UUID: {uuid})',
                file=sys.stderr,
            )
        else:
            cprint(f'INFO: GPU {device.index}: {name} (UUID: {uuid})', file=sys.stderr)

    exporter = PrometheusExporter(devices, hostname=args.hostname, interval=args.interval)

    try:
        start_wsgi_server(port=args.port, addr=args.bind_address)
    except OSError as ex:
        # Map the two most common bind failures to actionable hints.
        if 'address already in use' in str(ex).lower():
            cprint(
                (
                    'ERROR: Address {} is already in use. '
                    'Please specify a different port via `--port <PORT>`.'
                ).format(
                    colored(
                        f'http://{args.bind_address}:{args.port}',
                        color='blue',
                        attrs=('bold', 'underline'),
                    ),
                ),
                file=sys.stderr,
            )
        elif 'cannot assign requested address' in str(ex).lower():
            cprint(
                (
                    'ERROR: Cannot assign requested address at {}. '
                    'Please specify a different address via `--bind-address <ADDRESS>`.'
                ).format(
                    colored(
                        f'http://{args.bind_address}:{args.port}',
                        color='blue',
                        attrs=('bold', 'underline'),
                    ),
                ),
                file=sys.stderr,
            )
        else:
            cprint(f'ERROR: {ex}', file=sys.stderr)
        return 1

    cprint(
        'INFO: Start the exporter on {} at {}.'.format(
            colored(args.hostname, color='magenta', attrs=('bold',)),
            colored(
                f'http://{args.bind_address}:{args.port}/metrics',
                color='green',
                attrs=('bold', 'underline'),
            ),
        ),
        file=sys.stderr,
    )

    # Blocks indefinitely, refreshing the gauges every `args.interval` seconds.
    try:
        exporter.collect()
    except KeyboardInterrupt:
        cprint(file=sys.stderr)
        cprint('INFO: Interrupted by user.', file=sys.stderr)

    return 0


if __name__ == '__main__':
    sys.exit(main())
# ============================================================================== """Prometheus exporter built on top of ``nvitop``.""" from __future__ import annotations import math import time from typing import Sequence from prometheus_client import REGISTRY, CollectorRegistry, Gauge, Info from nvitop import Device, GpuProcess, MiB, MigDevice, PhysicalDevice, host from nvitop_exporter.utils import get_ip_address class PrometheusExporter: # pylint: disable=too-many-instance-attributes """Prometheus exporter built on top of ``nvitop``.""" def __init__( # pylint: disable=too-many-statements self, devices: Sequence[Device], hostname: str | None = None, *, registry: CollectorRegistry = REGISTRY, interval: float = 1.0, ) -> None: """Initialize the Prometheus exporter.""" if not isinstance(devices, (list, tuple)): raise TypeError(f'Expected a list or tuple of devices, got {type(devices)}') devices = list(devices) for device in devices: if not isinstance(device, (PhysicalDevice, MigDevice)): raise TypeError(f'Expected a PhysicalDevice or MigDevice, got {type(device)}') self.devices = devices self.hostname = hostname or get_ip_address() self.registry = registry self.interval = interval self.alive_pids: dict[Device, set[tuple[int, str]]] = { device: set() for device in self.devices } self.info = Info( 'nvitop', documentation='NVITOP Prometheus Exporter.', labelnames=['hostname'], registry=self.registry, ) self.info.labels(hostname=self.hostname).info( { 'device_count': str(Device.count()), 'driver_version': Device.driver_version(), 'cuda_driver_version': Device.cuda_driver_version(), }, ) # Create gauges for host metrics self.host_uptime = Gauge( name='host_uptime', documentation='Host uptime (s).', unit='Second', labelnames=['hostname'], registry=self.registry, ) self.host_cpu_percent = Gauge( name='host_cpu_percent', documentation='Host CPU percent (%).', unit='Percentage', labelnames=['hostname'], registry=self.registry, ) self.host_virtual_memory_total = Gauge( 
name='host_virtual_memory_total', documentation='Host virtual memory total (MiB).', unit='MiB', labelnames=['hostname'], registry=self.registry, ) self.host_virtual_memory_used = Gauge( name='host_virtual_memory_used', documentation='Host virtual memory used (MiB).', unit='MiB', labelnames=['hostname'], registry=self.registry, ) self.host_virtual_memory_free = Gauge( name='host_virtual_memory_free', documentation='Host virtual memory free (MiB).', unit='MiB', labelnames=['hostname'], registry=self.registry, ) self.host_virtual_memory_percent = Gauge( name='host_virtual_memory_percent', documentation='Host virtual memory percent (%).', unit='Percentage', labelnames=['hostname'], registry=self.registry, ) self.host_swap_memory_total = Gauge( name='host_swap_memory_total', documentation='Host swap total (MiB).', unit='MiB', labelnames=['hostname'], registry=self.registry, ) self.host_swap_memory_used = Gauge( name='host_swap_memory_used', documentation='Host swap used (MiB).', unit='MiB', labelnames=['hostname'], registry=self.registry, ) self.host_swap_memory_free = Gauge( name='host_swap_memory_free', documentation='Host swap free (MiB).', unit='MiB', labelnames=['hostname'], registry=self.registry, ) self.host_swap_memory_percent = Gauge( name='host_swap_memory_percent', documentation='Host swap percent (%).', unit='Percentage', labelnames=['hostname'], registry=self.registry, ) self.host_load_average_1m = Gauge( name='host_load_average_1m', documentation='Host load average for the last minute.', unit='Percentage', labelnames=['hostname'], registry=self.registry, ) self.host_load_average_5m = Gauge( name='host_load_average_5m', documentation='Host load average for the last 5 minutes.', unit='Percentage', labelnames=['hostname'], registry=self.registry, ) self.host_load_average_15m = Gauge( name='host_load_average_15m', documentation='Host load average for the last 15 minutes.', unit='Percentage', labelnames=['hostname'], registry=self.registry, ) 
self.host_net_io_tx_data = Gauge( name='host_net_io_tx_data', documentation='Host network I/O transmitted data (MiB).', unit='MiB', labelnames=['hostname', 'interface'], registry=self.registry, ) self.host_net_io_rx_data = Gauge( name='host_net_io_rx_data', documentation='Host network I/O received data (MiB).', unit='MiB', labelnames=['hostname', 'interface'], registry=self.registry, ) self.host_net_io_tx_packets = Gauge( name='host_net_io_tx_packets', documentation='Host network I/O transmitted packets.', unit='Packet', labelnames=['hostname', 'interface'], registry=self.registry, ) self.host_net_io_rx_packets = Gauge( name='host_net_io_rx_packets', documentation='Host network I/O received packets.', unit='Packet', labelnames=['hostname', 'interface'], registry=self.registry, ) self.host_disk_io_read_data = Gauge( name='host_disk_io_read_data', documentation='Host disk I/O read data (MiB).', unit='MiB', labelnames=['hostname', 'partition'], registry=self.registry, ) self.host_disk_io_write_data = Gauge( name='host_disk_io_write_data', documentation='Host disk I/O write data (MiB).', unit='MiB', labelnames=['hostname', 'partition'], registry=self.registry, ) self.host_disk_usage_total = Gauge( name='host_disk_usage_total', documentation='Host disk usage total (MiB).', unit='MiB', labelnames=['hostname', 'mountpoint'], registry=self.registry, ) self.host_disk_usage_used = Gauge( name='host_disk_usage_used', documentation='Host disk usage used (MiB).', unit='MiB', labelnames=['hostname', 'mountpoint'], registry=self.registry, ) self.host_disk_usage_free = Gauge( name='host_disk_usage_free', documentation='Host disk usage free (MiB).', unit='MiB', labelnames=['hostname', 'mountpoint'], registry=self.registry, ) self.host_disk_usage_percent = Gauge( name='host_disk_usage_percent', documentation='Host disk usage percent (%).', unit='Percentage', labelnames=['hostname', 'mountpoint'], registry=self.registry, ) # Create gauges for GPU metrics self.gpu_utilization = Gauge( 
name='gpu_utilization', documentation='GPU utilization (%).', unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_memory_utilization = Gauge( name='gpu_memory_utilization', documentation='GPU memory utilization (%).', unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_encoder_utilization = Gauge( name='gpu_encoder_utilization', documentation='GPU encoder utilization (%).', unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_decoder_utilization = Gauge( name='gpu_decoder_utilization', documentation='GPU decoder utilization (%).', unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_memory_total = Gauge( name='gpu_memory_total', documentation='GPU memory total (MiB).', unit='MiB', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_memory_used = Gauge( name='gpu_memory_used', documentation='GPU memory used (MiB).', unit='MiB', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_memory_free = Gauge( name='gpu_memory_free', documentation='GPU memory free (MiB).', unit='MiB', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_memory_percent = Gauge( name='gpu_memory_percent', documentation='GPU memory percent (%).', unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_clock_sm = Gauge( name='gpu_clock_sm', documentation='GPU SM clock (MHz).', unit='MHz', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_clock_memory = Gauge( name='gpu_clock_memory', documentation='GPU memory clock (MHz).', unit='MHz', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_clock_graphics = Gauge( 
name='gpu_clock_graphics', documentation='GPU graphics clock (MHz).', unit='MHz', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_clock_video = Gauge( name='gpu_clock_video', documentation='GPU video clock (MHz).', unit='MHz', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_power_usage = Gauge( name='gpu_power_usage', documentation='GPU power usage (W).', unit='W', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_power_limit = Gauge( name='gpu_power_limit', documentation='GPU power limit (W).', unit='W', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_temperature = Gauge( name='gpu_temperature', documentation='GPU temperature (C).', unit='C', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_fan_speed = Gauge( name='gpu_fan_speed', documentation='GPU fan speed (%).', unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_pcie_tx_throughput = Gauge( name='gpu_pcie_tx_throughput', documentation='GPU PCIe transmit throughput (MiB/s).', unit='MiBps', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_pcie_rx_throughput = Gauge( name='gpu_pcie_rx_throughput', documentation='GPU PCIe receive throughput (MiB/s).', unit='MiBps', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_nvlink_total_tx_throughput = Gauge( name='gpu_nvlink_total_tx_throughput', documentation='GPU total NVLink transmit throughput (MiB/s).', unit='MiBps', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_nvlink_total_rx_throughput = Gauge( name='gpu_nvlink_total_rx_throughput', documentation='GPU total NVLink receive throughput (MiB/s).', unit='MiBps', labelnames=['hostname', 'index', 'devicename', 'uuid'], 
registry=self.registry, ) self.gpu_nvlink_mean_tx_throughput = Gauge( name='gpu_nvlink_mean_tx_throughput', documentation='GPU mean NVLink transmit throughput (MiB/s).', unit='MiBps', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_nvlink_mean_rx_throughput = Gauge( name='gpu_nvlink_mean_rx_throughput', documentation='GPU mean NVLink receive throughput (MiB/s).', unit='MiBps', labelnames=['hostname', 'index', 'devicename', 'uuid'], registry=self.registry, ) self.gpu_nvlink_tx_throughput = Gauge( name='gpu_nvlink_tx_throughput', documentation='GPU NVLink transmit throughput (MiB/s).', unit='MiBps', labelnames=['hostname', 'index', 'devicename', 'uuid', 'link'], registry=self.registry, ) self.gpu_nvlink_rx_throughput = Gauge( name='gpu_nvlink_rx_throughput', documentation='GPU NVLink receive throughput (MiB/s).', unit='MiBps', labelnames=['hostname', 'index', 'devicename', 'uuid', 'link'], registry=self.registry, ) # Create gauges for process metrics self.process_info = Info( name='process_info', documentation='Process information.', labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], registry=self.registry, ) self.process_running_time = Gauge( name='process_running_time', documentation='Process running time (s).', unit='Second', labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], registry=self.registry, ) self.process_cpu_percent = Gauge( name='process_cpu_percent', documentation='Process CPU percent (%).', unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], registry=self.registry, ) self.process_rss_memory = Gauge( name='process_rss_memory', documentation='Process memory resident set size (MiB).', unit='MiB', labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], registry=self.registry, ) self.process_memory_percent = Gauge( name='process_memory_percent', documentation='Process memory percent (%).', 
unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], registry=self.registry, ) self.process_gpu_memory = Gauge( name='process_gpu_memory', documentation='Process GPU memory (MiB).', unit='MiB', labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], registry=self.registry, ) self.process_gpu_sm_utilization = Gauge( name='process_gpu_sm_utilization', documentation='Process GPU SM utilization (%).', unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], registry=self.registry, ) self.process_gpu_memory_utilization = Gauge( name='process_gpu_memory_utilization', documentation='Process GPU memory utilization (%).', unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], registry=self.registry, ) self.process_gpu_encoder_utilization = Gauge( name='process_gpu_encoder_utilization', documentation='Process GPU encoder utilization (%).', unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], registry=self.registry, ) self.process_gpu_decoder_utilization = Gauge( name='process_gpu_decoder_utilization', documentation='Process GPU decoder utilization (%).', unit='Percentage', labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], registry=self.registry, ) def collect(self) -> None: """Collect metrics.""" while True: next_update_time = time.monotonic() + self.interval self.update_host() for device in self.devices: self.update_device(device) time.sleep(max(0.0, next_update_time - time.monotonic())) def update_host(self) -> None: """Update metrics for the host.""" load_average = host.load_average() if load_average is None: load_average = (0.0, 0.0, 0.0) # type: ignore[unreachable] virtual_memory = host.virtual_memory() swap_memory = host.swap_memory() net_io_counters = host.net_io_counters(pernic=True) # type: ignore[attr-defined] disk_io_counters = host.disk_io_counters(perdisk=True) # 
type: ignore[attr-defined] for gauge, value in ( (self.host_uptime, host.uptime()), (self.host_cpu_percent, host.cpu_percent()), (self.host_virtual_memory_total, virtual_memory.total / MiB), (self.host_virtual_memory_used, virtual_memory.used / MiB), (self.host_virtual_memory_free, virtual_memory.free / MiB), (self.host_virtual_memory_percent, virtual_memory.percent), (self.host_swap_memory_total, swap_memory.total / MiB), (self.host_swap_memory_used, swap_memory.used / MiB), (self.host_swap_memory_free, swap_memory.free / MiB), (self.host_swap_memory_percent, swap_memory.percent), (self.host_load_average_1m, load_average[0]), (self.host_load_average_5m, load_average[1]), (self.host_load_average_15m, load_average[2]), ): gauge.labels(self.hostname).set(value) for interface, net_io_counter in net_io_counters.items(): for gauge, value in ( (self.host_net_io_tx_data, net_io_counter.bytes_sent / MiB), (self.host_net_io_rx_data, net_io_counter.bytes_recv / MiB), (self.host_net_io_tx_packets, net_io_counter.packets_sent), (self.host_net_io_rx_packets, net_io_counter.packets_recv), ): gauge.labels(hostname=self.hostname, interface=interface).set(value) for partition, disk_io_counter in disk_io_counters.items(): for gauge, value in ( (self.host_disk_io_read_data, disk_io_counter.read_bytes / MiB), (self.host_disk_io_write_data, disk_io_counter.write_bytes / MiB), ): gauge.labels(hostname=self.hostname, partition=partition).set(value) for partition in host.disk_partitions(): # type: ignore[attr-defined] try: partition_usage = host.disk_usage(partition.mountpoint) # type: ignore[attr-defined] except (OSError, host.PsutilError): continue for gauge, value in ( (self.host_disk_usage_total, partition_usage.total / MiB), (self.host_disk_usage_used, partition_usage.used / MiB), (self.host_disk_usage_free, partition_usage.free / MiB), (self.host_disk_usage_percent, partition_usage.percent), ): gauge.labels(hostname=self.hostname, mountpoint=partition.mountpoint).set(value) def 
update_device(self, device: Device) -> None: # pylint: disable=too-many-locals """Update metrics for a single device.""" index = ( str(device.index) if isinstance(device.index, int) else ':'.join(map(str, device.index)) ) name = device.name() uuid = device.uuid() with device.oneshot(): for gauge, value in ( (self.gpu_utilization, float(device.gpu_utilization())), (self.gpu_memory_utilization, float(device.memory_utilization())), (self.gpu_encoder_utilization, float(device.encoder_utilization())), (self.gpu_decoder_utilization, float(device.decoder_utilization())), (self.gpu_memory_total, device.memory_total() / MiB), (self.gpu_memory_used, device.memory_used() / MiB), (self.gpu_memory_free, device.memory_free() / MiB), (self.gpu_memory_percent, float(device.memory_percent())), (self.gpu_clock_sm, float(device.clock_infos().sm)), (self.gpu_clock_memory, float(device.clock_infos().memory)), (self.gpu_clock_graphics, float(device.clock_infos().graphics)), (self.gpu_clock_video, float(device.clock_infos().video)), (self.gpu_power_usage, device.power_usage() / 1000.0), (self.gpu_power_limit, device.power_limit() / 1000.0), (self.gpu_temperature, float(device.temperature())), (self.gpu_fan_speed, float(device.fan_speed())), (self.gpu_pcie_tx_throughput, device.pcie_tx_throughput() / 1024.0), (self.gpu_pcie_rx_throughput, device.pcie_rx_throughput() / 1024.0), (self.gpu_nvlink_total_tx_throughput, device.nvlink_total_tx_throughput() / 1024.0), (self.gpu_nvlink_total_rx_throughput, device.nvlink_total_rx_throughput() / 1024.0), (self.gpu_nvlink_mean_tx_throughput, device.nvlink_mean_tx_throughput() / 1024.0), (self.gpu_nvlink_mean_rx_throughput, device.nvlink_mean_rx_throughput() / 1024.0), ): gauge.labels( hostname=self.hostname, index=index, devicename=name, uuid=uuid, ).set(value) for gauge, nvlink_throughput in ( (self.gpu_nvlink_tx_throughput, device.nvlink_tx_throughput()), (self.gpu_nvlink_rx_throughput, device.nvlink_rx_throughput()), ): for link, throughput in 
enumerate(nvlink_throughput): gauge.labels( hostname=self.hostname, index=index, devicename=name, uuid=uuid, link=link, ).set(throughput / 1024.0) alive_pids = self.alive_pids[device] previous_alive_pids = alive_pids.copy() alive_pids.clear() with GpuProcess.failsafe(): host_snapshots = {} for pid, process in device.processes().items(): with process.oneshot(): username = process.username() if (pid, username) not in host_snapshots: # noqa: SIM401,RUF100 host_snapshot = host_snapshots[pid, username] = process.host_snapshot() else: host_snapshot = host_snapshots[pid, username] self.process_info.labels( hostname=self.hostname, index=index, devicename=name, uuid=uuid, pid=pid, username=username, ).info( { 'status': host_snapshot.status, 'command': host_snapshot.command, }, ) for gauge, value in ( ( self.process_running_time, ( host_snapshot.running_time.total_seconds() if host_snapshot.running_time else math.nan ), ), (self.process_cpu_percent, host_snapshot.cpu_percent), (self.process_rss_memory, host_snapshot.host_memory / MiB), (self.process_memory_percent, float(host_snapshot.memory_percent)), (self.process_gpu_memory, process.gpu_memory() / MiB), ( self.process_gpu_sm_utilization, float(process.gpu_sm_utilization()), ), ( self.process_gpu_memory_utilization, float(process.gpu_memory_utilization()), ), ( self.process_gpu_encoder_utilization, float(process.gpu_encoder_utilization()), ), ( self.process_gpu_decoder_utilization, float(process.gpu_decoder_utilization()), ), ): gauge.labels( hostname=self.hostname, index=index, devicename=name, uuid=uuid, pid=pid, username=username, ).set(value) alive_pids.update(host_snapshots) for pid, username in previous_alive_pids.difference(alive_pids): for collector in ( self.process_info, self.process_running_time, self.process_cpu_percent, self.process_rss_memory, self.process_memory_percent, self.process_gpu_memory, self.process_gpu_sm_utilization, self.process_gpu_memory_utilization, self.process_gpu_encoder_utilization, 
self.process_gpu_decoder_utilization, ): try: collector.remove( self.hostname, index, name, uuid, pid, username, ) except KeyError: # noqa: PERF203 pass nvitop-1.4.2/nvitop-exporter/nvitop_exporter/utils.py000066400000000000000000000025131474547113600232200ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Utility functions for ``nvitop-exporter``.""" import socket __all__ = ['get_ip_address'] # Reference: https://stackoverflow.com/a/28950776 def get_ip_address() -> str: """Get the IP address of the current machine.""" s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.settimeout(0.0) try: # Doesn't even have to be reachable s.connect(('10.254.254.254', 1)) ip_address = s.getsockname()[0] except Exception: # noqa: BLE001 # pylint: disable=broad-except ip_address = '127.0.0.1' finally: s.close() return ip_address nvitop-1.4.2/nvitop-exporter/nvitop_exporter/version.py000066400000000000000000000036231474547113600235500ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Prometheus exporter built on top of ``nvitop``.""" # pylint: disable=invalid-name __version__ = '1.4.2' __license__ = 'Apache-2.0' __author__ = __maintainer__ = 'Xuehai Pan' __email__ = 'XuehaiPan@pku.edu.cn' __release__ = False if not __release__: import os import subprocess try: prefix, sep, suffix = ( subprocess.check_output( # noqa: S603 ['git', 'describe', '--abbrev=7'], # noqa: S607 cwd=os.path.dirname(os.path.abspath(__file__)), stderr=subprocess.DEVNULL, text=True, ) .strip() .lstrip('v') .replace('-', '.dev', 1) .replace('-', '+', 1) .partition('.dev') ) if sep: version_prefix, dot, version_tail = prefix.rpartition('.') prefix = f'{version_prefix}{dot}{int(version_tail) + 1}' __version__ = f'{prefix}{sep}{suffix}' del version_prefix, dot, version_tail else: __version__ = prefix del prefix, sep, suffix except (OSError, subprocess.CalledProcessError): pass del os, subprocess nvitop-1.4.2/nvitop-exporter/pyproject.toml000066400000000000000000000041441474547113600211550ustar00rootroot00000000000000[build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" [project] name = "nvitop-exporter" description = "Prometheus exporter built on top of `nvitop`." 
readme = "README.md" requires-python = ">= 3.7" authors = [{ name = "Xuehai Pan", email = "XuehaiPan@pku.edu.cn" }] license = { text = "Apache License, Version 2.0 (Apache-2.0)" } keywords = [ "nvidia", "nvidia-smi", "NVIDIA", "NVML", "CUDA", "GPU", "top", "monitoring", "prometheus", "Prometheus", "grafana", "Grafana", ] classifiers = [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX :: Linux", "Environment :: GPU", "Environment :: GPU :: NVIDIA CUDA", "Intended Audience :: Developers", "Intended Audience :: End Users/Desktop", "Intended Audience :: System Administrators", "Topic :: System :: Hardware", "Topic :: System :: Monitoring", "Topic :: System :: Systems Administration", "Topic :: Utilities", ] dependencies = [ # Sync with nvitop/version.py and requirements.txt "nvitop ~= 1.4.2", "prometheus-client >= 0.4.0", ] dynamic = ["version"] [project.scripts] nvitop-exporter = "nvitop_exporter.cli:main" [project.urls] Homepage = "https://github.com/XuehaiPan/nvitop" Repository = "https://github.com/XuehaiPan/nvitop" Documentation = "https://nvitop.readthedocs.io" "Bug Report" = "https://github.com/XuehaiPan/nvitop/issues" [tool.setuptools.packages.find] include = ["nvitop_exporter", "nvitop_exporter.*"] [tool.black] safe = true line-length = 100 skip-string-normalization = true target-version = ["py37"] [tool.ruff] extend = "../pyproject.toml" nvitop-1.4.2/nvitop-exporter/requirements.txt000066400000000000000000000000421474547113600215160ustar00rootroot00000000000000nvitop prometheus-client >= 0.4.0 
nvitop-1.4.2/nvitop-exporter/setup.py000077500000000000000000000034741474547113600177630ustar00rootroot00000000000000#!/usr/bin/env python3 """Setup script for ``nvitop-exporter``.""" from __future__ import annotations import contextlib import re import sys from importlib.util import module_from_spec, spec_from_file_location from pathlib import Path from typing import TYPE_CHECKING, Generator from setuptools import setup if TYPE_CHECKING: from types import ModuleType HERE = Path(__file__).absolute().parent @contextlib.contextmanager def vcs_version(name: str, path: Path | str) -> Generator[ModuleType]: """Context manager to update version string in a version module.""" path = Path(path).absolute() assert path.is_file() module_spec = spec_from_file_location(name=name, location=path) assert module_spec is not None assert module_spec.loader is not None module = sys.modules.get(name) if module is None: module = module_from_spec(module_spec) sys.modules[name] = module module_spec.loader.exec_module(module) if module.__release__: yield module return content = None try: try: content = path.read_text(encoding='utf-8') path.write_text( data=re.sub( r"""__version__\s*=\s*('[^']+'|"[^"]+")""", f'__version__ = {module.__version__!r}', string=content, ), encoding='utf-8', ) except OSError: content = None yield module finally: if content is not None: with path.open(mode='wt', encoding='utf-8', newline='') as file: file.write(content) with vcs_version( name='nvitop_exporter.version', path=(HERE / 'nvitop_exporter' / 'version.py'), ) as version: setup( name='nvitop-exporter', version=version.__version__, ) nvitop-1.4.2/nvitop/000077500000000000000000000000001474547113600143705ustar00rootroot00000000000000nvitop-1.4.2/nvitop/__init__.py000066400000000000000000000032041474547113600165000ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for GPU process management.""" import sys from nvitop import api from nvitop.api import * # noqa: F403 from nvitop.api import ( caching, collector, device, host, libcuda, libcudart, libnvml, process, termcolor, utils, ) from nvitop.select import select_devices from nvitop.version import __version__ __all__ = [*api.__all__, 'select_devices'] # Add submodules to the top-level namespace for submodule in ( caching, collector, device, host, libcuda, libcudart, libnvml, process, termcolor, utils, ): sys.modules[f'{__name__}.{submodule.__name__.rpartition(".")[-1]}'] = submodule # Remove the nvitop.select module from sys.modules # Required for `python -m nvitop.select` to work properly sys.modules.pop(f'{__name__}.select', None) del sys nvitop-1.4.2/nvitop/__main__.py000066400000000000000000000003651474547113600164660ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. 
"""The interactive NVIDIA-GPU process viewer.""" import sys from nvitop.cli import main if __name__ == '__main__': sys.exit(main()) nvitop-1.4.2/nvitop/api/000077500000000000000000000000001474547113600151415ustar00rootroot00000000000000nvitop-1.4.2/nvitop/api/LICENSE000066400000000000000000000261501474547113600161520ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2021-2025 Xuehai Pan. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. nvitop-1.4.2/nvitop/api/__init__.py000066400000000000000000000051321474547113600172530ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== """The core APIs of nvitop.""" from nvitop.api import ( caching, collector, device, host, libcuda, libcudart, libnvml, process, termcolor, utils, ) from nvitop.api.caching import ttl_cache from nvitop.api.collector import ResourceMetricCollector, collect_in_background, take_snapshots from nvitop.api.device import ( CudaDevice, CudaMigDevice, Device, MigDevice, PhysicalDevice, normalize_cuda_visible_devices, parse_cuda_visible_devices, ) from nvitop.api.libnvml import NVMLError, nvmlCheckReturn from nvitop.api.process import GpuProcess, HostProcess, command_join from nvitop.api.utils import ( # explicitly export these to appease mypy NA, SIZE_UNITS, UINT_MAX, ULONGLONG_MAX, GiB, KiB, MiB, NaType, NotApplicable, NotApplicableType, PiB, Snapshot, TiB, boolify, bytes2human, colored, human2bytes, set_color, timedelta2human, utilization2string, ) __all__ = [ 'NVMLError', 'nvmlCheckReturn', 'libnvml', 'libcuda', 'libcudart', # nvitop.api.device 'Device', 'PhysicalDevice', 'MigDevice', 'CudaDevice', 'CudaMigDevice', 'parse_cuda_visible_devices', 'normalize_cuda_visible_devices', # nvitop.api.process 'host', 'HostProcess', 'GpuProcess', 'command_join', # nvitop.api.collector 'take_snapshots', 'collect_in_background', 'ResourceMetricCollector', # nvitop.api.caching 'ttl_cache', # nvitop.api.utils 'NA', 'NaType', 'NotApplicable', 'NotApplicableType', 'UINT_MAX', 'ULONGLONG_MAX', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'SIZE_UNITS', 'bytes2human', 'human2bytes', 'timedelta2human', 'utilization2string', 'colored', 'set_color', 'boolify', 'Snapshot', ] nvitop-1.4.2/nvitop/api/caching.py000066400000000000000000000243041474547113600171120ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Caching utilities.""" from __future__ import annotations import builtins import functools import time from threading import RLock from typing import TYPE_CHECKING, Any, NamedTuple, overload if TYPE_CHECKING: from collections.abc import Callable, Hashable, Sized from collections.abc import Set as AbstractSet from typing import TypeVar from typing_extensions import ( ParamSpec, # Python 3.10+ Self, # Python 3.11+ ) _P = ParamSpec('_P') _T = TypeVar('_T') __all__ = ['ttl_cache'] class _CacheInfo(NamedTuple): """A named tuple representing the cache statistics.""" hits: int misses: int maxsize: int currsize: int try: from functools import _make_key except ImportError: class _HashedSeq(list): """This class guarantees that hash() will be called no more than once per element.""" __slots__ = ('__hashvalue',) def __init__( self, seq: tuple[Any, ...], hash: Callable[[Any], int] = builtins.hash, # pylint: disable=redefined-builtin ) -> None: """Initialize the hashed sequence.""" self[:] = seq self.__hashvalue = hash(seq) def __hash__(self) -> int: # type: ignore[override] """Return the hash value of the hashed sequence.""" return self.__hashvalue _KWD_MARK = object() # pylint: disable-next=too-many-arguments def _make_key( # type: ignore[misc] args: tuple[Hashable, ...], kwds: dict[str, Hashable], typed: bool, *, kwd_mark: tuple[object, ...] 
= (_KWD_MARK,), fasttypes: AbstractSet[type] = frozenset({int, str}), tuple: type[tuple] = builtins.tuple, # pylint: disable=redefined-builtin type: type[type] = builtins.type, # pylint: disable=redefined-builtin len: Callable[[Sized], int] = builtins.len, # pylint: disable=redefined-builtin ) -> Hashable: """Make a cache key from optionally typed positional and keyword arguments.""" key = args if kwds: key += kwd_mark for item in kwds.items(): key += item if typed: key += tuple(type(v) for v in args) if kwds: key += tuple(type(v) for v in kwds.values()) elif len(key) == 1 and type(key[0]) in fasttypes: return key[0] return _HashedSeq(key) class _TTLCacheLink: # pylint: disable=too-few-public-methods __slots__ = ('expires', 'key', 'next', 'prev', 'value') # pylint: disable-next=too-many-arguments,too-many-positional-arguments def __init__( self, prev: Self | None, next: Self | None, # pylint: disable=redefined-builtin key: Hashable, value: Any, expires: float | None, ) -> None: self.prev: Self = prev # type: ignore[assignment] self.next: Self = next # type: ignore[assignment] self.key: Hashable = key self.value: Any = value self.expires: float = expires # type: ignore[assignment] @overload def ttl_cache( maxsize: int | None = 128, ttl: float = 600.0, timer: Callable[[], float] = time.monotonic, typed: bool = False, ) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]: ... @overload def ttl_cache( maxsize: Callable[_P, _T], ttl: float = 600.0, timer: Callable[[], float] = time.monotonic, typed: bool = False, ) -> Callable[_P, _T]: ... 
# pylint: disable-next=too-many-statements def ttl_cache( maxsize: int | Callable[_P, _T] | None = 128, ttl: float = 600.0, timer: Callable[[], float] = time.monotonic, typed: bool = False, ) -> Callable[[Callable[_P, _T]], Callable[_P, _T]] | Callable[_P, _T]: """Time aware cache decorator.""" if isinstance(maxsize, int): # Negative maxsize is treated as 0 maxsize = max(0, maxsize) elif callable(maxsize) and isinstance(typed, bool): # The user_function was passed in directly via the maxsize argument func, maxsize = maxsize, 128 return ttl_cache(maxsize, ttl=ttl, timer=timer, typed=typed)(func) elif maxsize is not None: raise TypeError('Expected first argument to be an integer, a callable, or None') if ttl < 0.0: raise ValueError('TTL must be a non-negative number') if not callable(timer): raise TypeError('Timer must be a callable') if maxsize == 0 or maxsize is None: return functools.lru_cache(maxsize=maxsize, typed=typed) # type: ignore[return-value] # pylint: disable-next=too-many-statements,too-many-locals def wrapper(func: Callable[_P, _T]) -> Callable[_P, _T]: cache: dict[Any, _TTLCacheLink] = {} cache_get = cache.get # bound method to lookup a key or return None cache_len = cache.__len__ # get cache size without calling len() lock = RLock() # because linked-list updates aren't thread-safe root = _TTLCacheLink(*((None,) * 5)) # root of the circular doubly linked list root.prev = root.next = root # initialize by pointing to self hits = misses = 0 full = False def unlink(link: _TTLCacheLink) -> _TTLCacheLink: with lock: link_prev, link_next = link.prev, link.next link_next.prev, link_prev.next = link_prev, link_next return link_next def append(link: _TTLCacheLink) -> _TTLCacheLink: with lock: last = root.prev last.next = root.prev = link link.prev, link.next = last, root return link def move_to_end(link: _TTLCacheLink) -> _TTLCacheLink: with lock: unlink(link) append(link) return link def expire() -> None: nonlocal full with lock: now = timer() front = 
root.next while front is not root and front.expires < now: del cache[front.key] front = unlink(front) full = cache_len() >= maxsize @functools.wraps(func) def wrapped(*args: _P.args, **kwargs: _P.kwargs) -> _T: # Size limited time aware caching nonlocal root, hits, misses, full key = _make_key(args, kwargs, typed) with lock: link = cache_get(key) if link is not None: if timer() < link.expires: hits += 1 return link.value expire() misses += 1 result = func(*args, **kwargs) expires = timer() + ttl with lock: if key in cache: # Getting here means that this same key was added to the cache while the lock # was released or the key was expired. Move the link to the front of the # circular queue. link = move_to_end(cache[key]) # We need only update the expiration time. link.value = result link.expires = expires else: if full: expire() if full: # Use the old root to store the new key and result. root.key = key root.value = result root.expires = expires # Empty the oldest link and make it the new root. # Keep a reference to the old key and old result to prevent their ref counts # from going to zero during the update. That will prevent potentially # arbitrary object clean-up code (i.e. __del__) from running while we're # still adjusting the links. front = root.next old_key = front.key front.key = front.value = front.expires = None # type: ignore[assignment] # Now update the cache dictionary. del cache[old_key] # Save the potentially reentrant cache[key] assignment for last, after the # root and links have been put in a consistent state. cache[key], root = root, front else: # Put result in a new link at the front of the queue. 
cache[key] = append(_TTLCacheLink(None, None, key, result, expires)) full = cache_len() >= maxsize return result def cache_info() -> _CacheInfo: """Report cache statistics.""" with lock: expire() return _CacheInfo(hits, misses, maxsize, cache_len()) def cache_clear() -> None: """Clear the cache and cache statistics.""" nonlocal hits, misses, full with lock: cache.clear() root.prev = root.next = root root.key = root.value = root.expires = None # type: ignore[assignment] hits = misses = 0 full = False wrapped.cache_info = cache_info # type: ignore[attr-defined] wrapped.cache_clear = cache_clear # type: ignore[attr-defined] wrapped.cache_parameters = lambda: {'maxsize': maxsize, 'typed': typed} # type: ignore[attr-defined] return wrapped return wrapper nvitop-1.4.2/nvitop/api/collector.py000066400000000000000000001054121474547113600175040ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== """Resource metrics collectors.""" from __future__ import annotations import contextlib import itertools import math import os import threading import time from collections import OrderedDict, defaultdict from typing import TYPE_CHECKING, ClassVar, NamedTuple, TypeVar from weakref import WeakSet from nvitop.api import host from nvitop.api.device import CudaDevice, Device from nvitop.api.process import GpuProcess, HostProcess from nvitop.api.utils import GiB, MiB, Snapshot if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable __all__ = ['take_snapshots', 'collect_in_background', 'ResourceMetricCollector'] class SnapshotResult(NamedTuple): # pylint: disable=missing-class-docstring devices: list[Snapshot] gpu_processes: list[Snapshot] timer = time.monotonic _T = TypeVar('_T') def _unique(iterable: Iterable[_T]) -> list[_T]: return list(OrderedDict.fromkeys(iterable).keys()) # pylint: disable-next=too-many-branches def take_snapshots( devices: Device | Iterable[Device] | None = None, *, gpu_processes: bool | GpuProcess | Iterable[GpuProcess] | None = None, ) -> SnapshotResult: """Retrieve status of demanded devices and GPU processes. Args: devices (Optional[Union[Device, Iterable[Device]]]): Requested devices for snapshots. If not given, the devices will be determined from GPU processes: **(1)** All devices (no GPU processes are given); **(2)** Devices that used by given GPU processes. gpu_processes (Optional[Union[bool, GpuProcess, Iterable[GpuProcess]]]): Requested GPU processes snapshots. If not given, all GPU processes running on the requested device will be returned. The GPU process snapshots can be suppressed by specifying ``gpu_processes=False``. Returns: SnapshotResult A named tuple containing two lists of snapshots. Note: If not arguments are specified, all devices and all GPU processes will be returned. 
Examples: >>> from nvitop import take_snapshots, Device >>> import os >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' >>> os.environ['CUDA_VISIBLE_DEVICES'] = '1,0' >>> take_snapshots() # equivalent to `take_snapshots(Device.all())` SnapshotResult( devices=[ PhysicalDeviceSnapshot( real=PhysicalDevice(index=0, ...), ... ), ... ], gpu_processes=[ GpuProcessSnapshot( real=GpuProcess(pid=xxxxxx, device=PhysicalDevice(index=0, ...), ...), ... ), ... ] ) >>> device_snapshots, gpu_process_snapshots = take_snapshots(Device.all()) # type: Tuple[List[DeviceSnapshot], List[GpuProcessSnapshot]] >>> device_snapshots, _ = take_snapshots(gpu_processes=False) # ignore process snapshots >>> take_snapshots(Device.cuda.all()) # use CUDA device enumeration SnapshotResult( devices=[ CudaDeviceSnapshot( real=CudaDevice(cuda_index=0, physical_index=1, ...), ... ), CudaDeviceSnapshot( real=CudaDevice(cuda_index=1, physical_index=0, ...), ... ), ], gpu_processes=[ GpuProcessSnapshot( real=GpuProcess(pid=xxxxxx, device=CudaDevice(cuda_index=0, ...), ...), ... ), ... ] ) >>> take_snapshots(Device.cuda(1)) # only SnapshotResult( devices=[ CudaDeviceSnapshot( real=CudaDevice(cuda_index=1, physical_index=0, ...), ... ) ], gpu_processes=[ GpuProcessSnapshot( real=GpuProcess(pid=xxxxxx, device=CudaDevice(cuda_index=1, ...), ...), ... ), ... 
] ) """ # pylint: disable=line-too-long if isinstance(devices, Device): devices = [devices] if isinstance(gpu_processes, GpuProcess): gpu_processes = [gpu_processes] if gpu_processes is not None and gpu_processes is not True: if gpu_processes: # is a non-empty list/tuple gpu_processes = list(gpu_processes) process_devices = _unique(process.device for process in gpu_processes) for device in process_devices: device.processes() # update GPU status for requested GPU processes if devices is None: devices = process_devices else: gpu_processes = [] # False or empty list/tuple if devices is None: devices = Device.all() else: if devices is None: physical_devices = Device.all() devices = [] leaf_devices: list[Device] = [] for physical_device in physical_devices: devices.append(physical_device) mig_devices = physical_device.mig_devices() if len(mig_devices) > 0: devices.extend(mig_devices) leaf_devices.extend(mig_devices) else: leaf_devices.append(physical_device) else: leaf_devices = devices = list(devices) gpu_processes = list( itertools.chain.from_iterable(device.processes().values() for device in leaf_devices), ) devices = [device.as_snapshot() for device in devices] # type: ignore[union-attr] gpu_processes = GpuProcess.take_snapshots(gpu_processes, failsafe=True) return SnapshotResult(devices, gpu_processes) # pylint: disable-next=too-many-arguments def collect_in_background( on_collect: Callable[[dict[str, float]], bool], collector: ResourceMetricCollector | None = None, interval: float | None = None, *, on_start: Callable[[ResourceMetricCollector], None] | None = None, on_stop: Callable[[ResourceMetricCollector], None] | None = None, tag: str = 'metrics-daemon', start: bool = True, ) -> threading.Thread: """Start a background daemon thread that collect and call the callback function periodically. See also :func:`ResourceMetricCollector.daemonize`. Args: on_collect (Callable[[Dict[str, float]], bool]): A callback function that will be called periodically. 
It takes a dictionary containing the resource metrics and returns a boolean indicating whether to continue monitoring. collector (Optional[ResourceMetricCollector]): A :class:`ResourceMetricCollector` instance to collect metrics. If not given, it will collect metrics for all GPUs and subprocess of the current process. interval (Optional[float]): The collect interval. If not given, use ``collector.interval``. on_start (Optional[Callable[[ResourceMetricCollector], None]]): A function to initialize the daemon thread and collector. on_stop (Optional[Callable[[ResourceMetricCollector], None]]): A function that do some necessary cleanup after the daemon thread is stopped. tag (str): The tag prefix used for metrics results. start (bool): Whether to start the daemon thread on return. Returns: threading.Thread A daemon thread object. Examples: .. code-block:: python logger = ... def on_collect(metrics): # will be called periodically if logger.is_closed(): # closed manually by user return False logger.log(metrics) return True def on_stop(collector): # will be called only once at stop if not logger.is_closed(): logger.close() # cleanup # Record metrics to the logger in the background every 5 seconds. # It will collect 5-second mean/min/max for each metric. 
collect_in_background( on_collect, ResourceMetricCollector(Device.cuda.all()), interval=5.0, on_stop=on_stop, ) """ if collector is None: collector = ResourceMetricCollector() if isinstance(interval, (int, float)) and interval > 0: interval = float(interval) elif interval is None: interval = collector.interval else: raise ValueError(f'Invalid argument interval={interval!r}') def target() -> None: if on_start is not None: on_start(collector) # type: ignore[arg-type] try: with collector(tag): # type: ignore[misc] try: next_snapshot = timer() + interval # type: ignore[operator] while on_collect(collector.collect()): # type: ignore[union-attr] time.sleep(max(0.0, next_snapshot - timer())) next_snapshot += interval # type: ignore[operator] except KeyboardInterrupt: pass finally: if on_stop is not None: on_stop(collector) # type: ignore[arg-type] daemon = threading.Thread(target=target, name=tag, daemon=True) daemon.collector = collector # type: ignore[attr-defined] if start: daemon.start() return daemon class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes """A class for collecting resource metrics. Args: devices (Iterable[Device]): Set of Device instances for logging. If not given, all physical devices on board will be used. root_pids (Set[int]): A set of PIDs, only the status of the descendant processes on the GPUs will be collected. If not given, the PID of the current process will be used. interval (float): The snapshot interval for background daemon thread. Core methods: .. code-block:: python collector.activate(tag='') # alias: start collector.deactivate() # alias: stop collector.clear(tag='') collector.collect() with collector(tag=''): ... 
collector.daemonize(on_collect_fn) Examples: >>> import os >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' >>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0' >>> from nvitop import ResourceMetricCollector, Device >>> collector = ResourceMetricCollector() # log all devices and descendant processes of the current process on the GPUs >>> collector = ResourceMetricCollector(root_pids={1}) # log all devices and all GPU processes >>> collector = ResourceMetricCollector(devices=Device.cuda.all()) # use the CUDA ordinal >>> with collector(tag=''): ... # Do something ... collector.collect() # -> Dict[str, float] # key -> '///' { '/host/cpu_percent (%)/mean': 8.967849777683456, '/host/cpu_percent (%)/min': 6.1, '/host/cpu_percent (%)/max': 28.1, ..., '/host/memory_percent (%)/mean': 21.5, '/host/swap_percent (%)/mean': 0.3, '/host/memory_used (GiB)/mean': 91.0136418208109, '/host/load_average (%) (1 min)/mean': 10.251427386878328, '/host/load_average (%) (5 min)/mean': 10.072539414569503, '/host/load_average (%) (15 min)/mean': 11.91126970422139, ..., '/cuda:0 (gpu:3)/memory_used (MiB)/mean': 3.875, '/cuda:0 (gpu:3)/memory_free (MiB)/mean': 11015.562499999998, '/cuda:0 (gpu:3)/memory_total (MiB)/mean': 11019.437500000002, '/cuda:0 (gpu:3)/memory_percent (%)/mean': 0.0, '/cuda:0 (gpu:3)/gpu_utilization (%)/mean': 0.0, '/cuda:0 (gpu:3)/memory_utilization (%)/mean': 0.0, '/cuda:0 (gpu:3)/fan_speed (%)/mean': 22.0, '/cuda:0 (gpu:3)/temperature (C)/mean': 25.0, '/cuda:0 (gpu:3)/power_usage (W)/mean': 19.11166264116916, ..., '/cuda:1 (gpu:2)/memory_used (MiB)/mean': 8878.875, ..., '/cuda:2 (gpu:1)/memory_used (MiB)/mean': 8182.875, ..., '/cuda:3 (gpu:0)/memory_used (MiB)/mean': 9286.875, ..., '/pid:12345/host/cpu_percent (%)/mean': 151.34342772112265, '/pid:12345/host/host_memory (MiB)/mean': 44749.72373447514, '/pid:12345/host/host_memory_percent (%)/mean': 8.675082352111717, '/pid:12345/host/running_time (min)': 336.23803206741576, '/pid:12345/cuda:1 (gpu:4)/gpu_memory 
(MiB)/mean': 8861.0, '/pid:12345/cuda:1 (gpu:4)/gpu_memory_percent (%)/mean': 80.4, '/pid:12345/cuda:1 (gpu:4)/gpu_memory_utilization (%)/mean': 6.711118172407917, '/pid:12345/cuda:1 (gpu:4)/gpu_sm_utilization (%)/mean': 48.23283397736476, ..., '/duration (s)': 7.247399162035435, '/timestamp': 1655909466.9981883 } """ # pylint: disable=line-too-long DEVICE_METRICS: ClassVar[list[tuple[str, str, float | int]]] = [ # (, , ) # GPU memory metrics ('memory_used', 'memory_used (MiB)', MiB), ('memory_free', 'memory_free (MiB)', MiB), ('memory_total', 'memory_total (MiB)', MiB), ('memory_percent', 'memory_percent (%)', 1.0), # GPU utilization metrics ('gpu_utilization', 'gpu_utilization (%)', 1.0), ('memory_utilization', 'memory_utilization (%)', 1.0), # Miscellaneous ('fan_speed', 'fan_speed (%)', 1.0), ('temperature', 'temperature (C)', 1.0), ('power_usage', 'power_usage (W)', 1000.0), ] PROCESS_METRICS: ClassVar[list[tuple[str, str | None, str, float | int]]] = [ # (, , , ) # Host resource metrics ('cpu_percent', 'host', 'cpu_percent (%)', 1.0), ('host_memory', 'host', 'host_memory (MiB)', MiB), ('host_memory_percent', 'host', 'host_memory_percent (%)', 1.0), ('running_time_in_seconds', 'host', 'running_time (min)', 60.0), # GPU memory metrics ('gpu_memory', None, 'gpu_memory (MiB)', MiB), ('gpu_memory_percent', None, 'gpu_memory_percent (%)', 1.0), ('gpu_memory_utilization', None, 'gpu_memory_utilization (%)', 1.0), # GPU utilization metrics ('gpu_sm_utilization', None, 'gpu_sm_utilization (%)', 1.0), ] def __init__( self, devices: Iterable[Device] | None = None, root_pids: Iterable[int] | None = None, interval: float = 1.0, ) -> None: """Initialize the resource metric collector.""" if isinstance(interval, (int, float)) and interval > 0: interval = float(interval) else: raise ValueError(f'Invalid argument interval={interval!r}') if devices is None: devices = Device.all() root_pids: set[int] = {os.getpid()} if root_pids is None else set(root_pids) self.interval: float = 
interval self.devices: list[Device] = list(devices) self.all_devices: list[Device] = [] self.leaf_devices: list[Device] = [] for device in self.devices: self.all_devices.append(device) mig_devices = device.mig_devices() if len(mig_devices) > 0: self.all_devices.extend(mig_devices) self.leaf_devices.extend(mig_devices) else: self.leaf_devices.append(device) self.root_pids: set[int] = root_pids self._positive_processes: WeakSet[HostProcess] = WeakSet( HostProcess(pid) for pid in self.root_pids ) self._negative_processes: WeakSet[HostProcess] = WeakSet() self._last_timestamp: float = timer() - 2.0 * self.interval self._lock: threading.RLock = threading.RLock() self._metric_buffer: _MetricBuffer | None = None self._tags: set[str] = set() self._daemon: threading.Thread = threading.Thread( name='metrics-collector-daemon', target=self._target, daemon=True, ) self._daemon_running: threading.Event = threading.Event() def activate(self, tag: str) -> ResourceMetricCollector: """Start a new metric collection with the given tag. Args: tag (str): The name of the new metric collection. The tag will be used to identify the metric collection. It must be a unique string. 
Examples: >>> collector = ResourceMetricCollector() >>> collector.activate(tag='train') # key prefix -> 'train' >>> collector.activate(tag='batch') # key prefix -> 'train/batch' >>> collector.deactivate() # key prefix -> 'train' >>> collector.deactivate() # the collector has been stopped >>> collector.activate(tag='test') # key prefix -> 'test' """ with self._lock: if self._metric_buffer is None or tag not in self._tags: self._tags.add(tag) self._metric_buffer = _MetricBuffer(tag, self, prev=self._metric_buffer) self._last_timestamp = timer() - 2.0 * self.interval else: raise RuntimeError(f'Resource metric collector is already started with tag "{tag}"') self._daemon_running.set() try: self._daemon.start() except RuntimeError: pass return self start = activate def deactivate(self, tag: str | None = None) -> ResourceMetricCollector: """Stop the current collection with the given tag and remove all sub-tags. If the tag is not specified, deactivate the current active collection. For nested collections, the sub-collections will be deactivated as well. Args: tag (Optional[str]): The tag to deactivate. If :data:`None`, the current active collection will be used. """ with self._lock: if self._metric_buffer is None: if tag is not None: raise RuntimeError('Resource metric collector has not been started yet.') return self if tag is None: tag = self._metric_buffer.tag elif tag not in self._tags: raise RuntimeError( f'Resource metric collector has not been started with tag "{tag}".', ) buffer = self._metric_buffer while True: self._tags.remove(buffer.tag) if buffer.tag == tag: self._metric_buffer = buffer.prev break buffer = buffer.prev # type: ignore[assignment] if self._metric_buffer is None: self._daemon_running.clear() return self stop = deactivate @contextlib.contextmanager def context(self, tag: str) -> Generator[ResourceMetricCollector]: """A context manager for starting and stopping resource metric collection. Args: tag (str): The name of the new metric collection. 
The tag will be used to identify the metric collection. It must be a unique string. Examples: >>> collector = ResourceMetricCollector() >>> with collector.context(tag='train'): # key prefix -> 'train' ... # Do something ... collector.collect() # -> Dict[str, float] """ try: self.activate(tag=tag) yield self finally: self.deactivate(tag=tag) __call__ = context # alias for `with collector(tag='')` def clear(self, tag: str | None = None) -> None: """Clear the metric collection with the given tag. If the tag is not specified, clear the current active collection. For nested collections, the sub-collections will be cleared as well. Args: tag (Optional[str]): The tag to clear. If :data:`None`, the current active collection will be reset. Examples: >>> collector = ResourceMetricCollector() >>> with collector(tag='train'): # key prefix -> 'train' ... time.sleep(5.0) ... collector.collect() # metrics within the 5.0s interval ... ... time.sleep(5.0) ... collector.collect() # metrics within the cumulative 10.0s interval ... ... collector.clear() # clear the active collection ... time.sleep(5.0) ... collector.collect() # metrics within the 5.0s interval ... ... with collector(tag='batch'): # key prefix -> 'train/batch' ... 
            collector.clear(tag='train')  # clear both 'train' and 'train/batch'
        """
        with self._lock:
            if self._metric_buffer is None:
                if tag is not None:
                    raise RuntimeError('Resource metric collector has not been started yet.')
                return

            if tag is None:
                tag = self._metric_buffer.tag
            elif tag not in self._tags:
                raise RuntimeError(
                    f'Resource metric collector has not been started with tag "{tag}".',
                )

            # Clear from the innermost buffer outwards until (and including)
            # the buffer with the requested tag.
            buffer = self._metric_buffer
            while True:
                buffer.clear()
                if buffer.tag == tag:
                    break
                buffer = buffer.prev  # type: ignore[assignment]

    reset = clear  # alias of :meth:`clear`

    def collect(self) -> dict[str, float]:
        """Get the average resource consumption during collection."""
        with self._lock:
            if self._metric_buffer is None:
                raise RuntimeError('Resource metric collector has not been started yet.')

            # Take a fresh snapshot first if the last one is older than half of
            # the snapshot interval, so the averages are reasonably up to date.
            if timer() - self._last_timestamp > self.interval / 2.0:
                self.take_snapshots()
            return self._metric_buffer.collect()

    # pylint: disable-next=too-many-arguments
    def daemonize(
        self,
        on_collect: Callable[[dict[str, float]], bool],
        interval: float | None = None,
        *,
        on_start: Callable[[ResourceMetricCollector], None] | None = None,
        on_stop: Callable[[ResourceMetricCollector], None] | None = None,
        tag: str = 'metrics-daemon',
        start: bool = True,
    ) -> threading.Thread:
        """Start a background daemon thread that collect and call the callback function periodically.

        See also :func:`collect_in_background`.

        Args:
            on_collect (Callable[[Dict[str, float]], bool]):
                A callback function that will be called periodically. It takes a dictionary containing
                the resource metrics and returns a boolean indicating whether to continue monitoring.
            interval (Optional[float]):
                The collect interval. If not given, use ``collector.interval``.
            on_start (Optional[Callable[[ResourceMetricCollector], None]]):
                A function to initialize the daemon thread and collector.
            on_stop (Optional[Callable[[ResourceMetricCollector], None]]):
                A function that does some necessary cleanup after the daemon thread is stopped.
            tag (str):
                The tag prefix used for metrics results.
            start (bool):
                Whether to start the daemon thread on return.

        Returns: threading.Thread
            A daemon thread object.

        Examples:
            .. code-block:: python

                logger = ...

                def on_collect(metrics):  # will be called periodically
                    if logger.is_closed():  # closed manually by user
                        return False
                    logger.log(metrics)
                    return True

                def on_stop(collector):  # will be called only once at stop
                    if not logger.is_closed():
                        logger.close()  # cleanup

                # Record metrics to the logger in the background every 5 seconds.
                # It will collect 5-second mean/min/max for each metric.
                ResourceMetricCollector(Device.cuda.all()).daemonize(
                    on_collect,
                    interval=5.0,
                    on_stop=on_stop,
                )
        """
        # NOTE(review): the previous docstring example passed a second positional
        # `ResourceMetricCollector(...)` argument, which would bind to `interval`
        # and then raise a TypeError together with the `interval=5.0` keyword;
        # the example above drops the stray argument.
        return collect_in_background(
            on_collect,
            collector=self,
            interval=interval,
            on_start=on_start,
            on_stop=on_stop,
            tag=tag,
            start=start,
        )

    def __del__(self) -> None:
        """Clean up the daemon thread on destruction."""
        self._daemon_running.clear()

    # pylint: disable-next=too-many-branches,too-many-locals,too-many-statements
    def take_snapshots(self) -> SnapshotResult:
        """Take snapshots of the current resource metrics and update the metric buffer."""
        if len(self.root_pids) > 0:
            # Gather all GPU processes on the leaf devices and keep only those
            # whose host process (or one of its ancestors) is classified as
            # "positive" (presumably a descendant of one of `self.root_pids` --
            # confirm against the collector constructor, which is out of view).
            all_gpu_processes: list[GpuProcess] = []
            for device in self.leaf_devices:
                all_gpu_processes.extend(device.processes().values())
            gpu_processes = []
            for process in all_gpu_processes:
                if process.host in self._negative_processes:
                    continue

                positive = True
                if process.host not in self._positive_processes:
                    positive = False
                    p = process.host
                    parents = []
                    while p is not None:
                        parents.append(p)
                        if p in self._positive_processes:
                            positive = True
                            break
                        try:
                            p = p.parent()  # type: ignore[assignment]
                        except host.PsutilError:
                            break
                    # Cache the walked ancestor chain so later snapshots can
                    # classify the same processes without re-walking.
                    if positive:
                        self._positive_processes.update(parents)
                    else:
                        self._negative_processes.update(parents)

                if positive:
                    gpu_processes.append(process)
        else:
            gpu_processes = []

        timestamp = timer()
        epoch_timestamp = time.time()
        metrics = {}
        device_snapshots = [device.as_snapshot() for device in self.all_devices]
        gpu_process_snapshots = GpuProcess.take_snapshots(gpu_processes, failsafe=True)

        metrics.update(
            {
                'host/cpu_percent (%)': host.cpu_percent(),
                'host/memory_percent (%)': host.memory_percent(),
                'host/swap_percent (%)': host.swap_percent(),
                'host/memory_used (GiB)': host.virtual_memory().used / GiB,
            },
        )
        load_average = host.load_average()
        if load_average is not None:  # load average is unavailable on some platforms
            metrics.update(
                {
                    'host/load_average (%) (1 min)': load_average[0],
                    'host/load_average (%) (5 min)': load_average[1],
                    'host/load_average (%) (15 min)': load_average[2],
                },
            )

        device_identifiers = {}
        for device_snapshot in device_snapshots:
            identifier = f'gpu:{device_snapshot.index}'
            if isinstance(device_snapshot.real, CudaDevice):
                identifier = f'cuda:{device_snapshot.cuda_index} ({identifier})'
            device_identifiers[device_snapshot.real] = identifier

            for attr, name, unit in self.DEVICE_METRICS:
                value = float(getattr(device_snapshot, attr)) / unit
                metrics[f'{identifier}/{name}'] = value

        for process_snapshot in gpu_process_snapshots:
            device_identifier = device_identifiers[process_snapshot.device]
            identifier = f'pid:{process_snapshot.pid}'

            for attr, scope, name, unit in self.PROCESS_METRICS:
                # A falsy scope means "scope the metric under its device".
                scope = scope or device_identifier
                value = float(getattr(process_snapshot, attr)) / unit
                metrics[f'{identifier}/{scope}/{name}'] = value

        with self._lock:
            if self._metric_buffer is not None:
                self._metric_buffer.add(
                    metrics,
                    timestamp=timestamp,
                    epoch_timestamp=epoch_timestamp,
                )
                self._last_timestamp = timestamp

        return SnapshotResult(device_snapshots, gpu_process_snapshots)

    def _target(self) -> None:
        # Daemon loop: take a snapshot every `self.interval` seconds until the
        # running flag is cleared.
        self._daemon_running.wait()
        while self._daemon_running.is_set():
            next_snapshot = timer() + self.interval
            self.take_snapshots()
            time.sleep(max(0.0, next_snapshot - timer()))
            next_snapshot += self.interval  # NOTE(review): dead store -- overwritten at loop top


class _MetricBuffer:  # pylint: disable=missing-class-docstring,missing-function-docstring,too-many-instance-attributes
    # One node of a linked stack of metric buffers: `prev` points to the buffer
    # of the enclosing (outer) tag, and every `add()` also feeds the outer ones.
    def __init__(
        self,
        tag: str,
        collector: ResourceMetricCollector,
        prev: _MetricBuffer | None = None,
    ) -> None:
        self.collector: ResourceMetricCollector = collector
        self.prev: _MetricBuffer | None = prev
        self.tag: str = tag

        # Nested tags are joined with '/' (e.g. 'train/batch').
        self.key_prefix: str
        if self.prev is not None:
            self.key_prefix = f'{self.prev.key_prefix}/{self.tag}'
        else:
            self.key_prefix = self.tag

        self.last_timestamp = self.start_timestamp = timer()
        self.last_epoch_timestamp = time.time()
        self.buffer: defaultdict[str, _StatisticsMaintainer] = defaultdict(
            lambda: _StatisticsMaintainer(self.last_timestamp),
        )

        self.len = 0

    def add(
        self,
        metrics: dict[str, float],
        timestamp: float | None = None,
        epoch_timestamp: float | None = None,
    ) -> None:
        if timestamp is None:
            timestamp = timer()
        if epoch_timestamp is None:
            epoch_timestamp = time.time()

        # Metrics previously seen but missing from this batch are recorded as
        # NaN so their statistics stop accumulating (see _StatisticsMaintainer).
        for key in set(self.buffer).difference(metrics):
            self.buffer[key].add(math.nan, timestamp=timestamp)
        for key, value in metrics.items():
            self.buffer[key].add(value, timestamp=timestamp)
        self.len += 1
        self.last_timestamp = timestamp
        self.last_epoch_timestamp = epoch_timestamp

        # Propagate to the enclosing buffer so outer tags aggregate too.
        # NOTE(review): `epoch_timestamp` is not forwarded; the outer buffer
        # refreshes its own via `time.time()` -- confirm this is intended.
        if self.prev is not None:
            self.prev.add(metrics, timestamp=timestamp)

    def clear(self) -> None:
        self.last_timestamp = self.start_timestamp = timer()
        self.last_epoch_timestamp = time.time()
        self.buffer.clear()
        self.len = 0

    def collect(self) -> dict[str, float]:
        metrics = {
            f'{self.key_prefix}/{key}/{name}': value
            for key, stats in self.buffer.items()
            for name, value in stats.items()
        }
        # 'host/running_time (min)' is monotonic: keep only the max statistic
        # and strip its '/max' suffix; drop the redundant mean/min entries.
        for key in tuple(metrics.keys()):
            if key.endswith('host/running_time (min)/max'):
                metrics[key[:-4]] = metrics[key]
                del metrics[key]
            elif key.endswith(('host/running_time (min)/mean', 'host/running_time (min)/min')):
                del metrics[key]
        metrics[f'{self.key_prefix}/duration (s)'] = timer() - self.start_timestamp
        metrics[f'{self.key_prefix}/timestamp'] = time.time()
        metrics[f'{self.key_prefix}/last_timestamp'] = self.last_epoch_timestamp
        return metrics

    def __len__(self) -> int:
        return self.len


class _StatisticsMaintainer:  # pylint: disable=missing-class-docstring,missing-function-docstring
    # Maintains running min / max / last and a trapezoidal time-integral for a
    # single metric stream so that `mean()` yields a time-weighted average.
    def __init__(self, timestamp: float) -> None:
        self.start_timestamp: float = timestamp
        self.last_timestamp: float = math.nan
        # `integral` accumulates a trapezoidal time-integral of the observed
        # values; it stays None until the first non-NaN sample arrives.
        self.integral: float | None = None
        self.last_value: float | None = None
        self.min_value: float | None = None
        self.max_value: float | None = None
        self.has_nan: bool = False

    def add(self, value: float, timestamp: float | None = None) -> None:
        if timestamp is None:
            timestamp = timer()

        if math.isnan(value):
            # Once a NaN is seen, freeze the integral at `last_timestamp`
            # (see `mean()`); min/max/last keep their previous values.
            self.has_nan = True
            return

        if self.last_value is None:
            # First sample: backfill from `start_timestamp` with this value.
            self.integral = value * (timestamp - self.start_timestamp)
            self.last_value = self.min_value = self.max_value = value
        else:
            # Trapezoidal rule between the previous and the current sample.
            # pylint: disable-next=line-too-long
            self.integral += (value + self.last_value) * (timestamp - self.last_timestamp) / 2.0  # type: ignore[operator]
            self.last_value = value
            self.min_value = min(self.min_value, value)  # type: ignore[type-var]
            self.max_value = max(self.max_value, value)  # type: ignore[type-var]

        self.last_timestamp = timestamp

    def mean(self) -> float:
        if self.integral is None:
            return math.nan
        if self.has_nan:
            # The stream went stale: average only over the observed span.
            return self.integral / (self.last_timestamp - self.start_timestamp)

        # No NaN so far: extrapolate the last value up to the current time.
        timestamp = timer()
        integral = self.integral + self.last_value * (timestamp - self.last_timestamp)  # type: ignore[operator]
        return integral / (timestamp - self.start_timestamp)

    def min(self) -> float:
        if self.min_value is None:
            return math.nan
        return self.min_value

    def max(self) -> float:
        if self.max_value is None:
            return math.nan
        return self.max_value

    def last(self) -> float:
        if self.last_value is None:
            return math.nan
        return self.last_value

    def items(self) -> Iterable[tuple[str, float]]:
        # Statistics exported into the collected metrics (see `_MetricBuffer.collect`).
        yield ('mean', self.mean())
        yield ('min', self.min())
        yield ('max', self.max())
        yield ('last', self.last())
nvitop-1.4.2/nvitop/api/device.py000066400000000000000000004170421474547113600167620ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2025 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The live classes for GPU devices.

The core classes are :class:`Device` and :class:`CudaDevice` (also aliased as :attr:`Device.cuda`).
The type of returned instance created by ``Class(args)`` is depending on the given arguments.

``Device()`` returns:

.. code-block:: python

    - (index: int)        -> PhysicalDevice
    - (index: (int, int)) -> MigDevice
    - (uuid: str)         -> Union[PhysicalDevice, MigDevice]  # depending on the UUID value
    - (bus_id: str)       -> PhysicalDevice

``CudaDevice()`` returns:

.. code-block:: python

    - (cuda_index: int)        -> Union[CudaDevice, CudaMigDevice]  # depending on `CUDA_VISIBLE_DEVICES`
    - (uuid: str)              -> Union[CudaDevice, CudaMigDevice]  # depending on `CUDA_VISIBLE_DEVICES`
    - (nvml_index: int)        -> CudaDevice
    - (nvml_index: (int, int)) -> CudaMigDevice

Examples:
    >>> from nvitop import Device, CudaDevice
    >>> Device.driver_version()              # version of the installed NVIDIA display driver
    '470.129.06'

    >>> Device.count()                       # number of NVIDIA GPUs in the system
    10

    >>> Device.all()                         # all physical devices in the system
    [
        PhysicalDevice(index=0, ...),
        PhysicalDevice(index=1, ...),
        ...
    ]

    >>> nvidia0 = Device(index=0)            # -> PhysicalDevice
    >>> mig10 = Device(index=(1, 0))         # -> MigDevice
    >>> nvidia2 = Device(uuid='GPU-xxxxxx')  # -> PhysicalDevice
    >>> mig30 = Device(uuid='MIG-xxxxxx')    # -> MigDevice
    >>> nvidia0.memory_free()                # total free memory in bytes
    11550654464
    >>> nvidia0.memory_free_human()          # total free memory in human readable format
    '11016MiB'

    >>> nvidia2.as_snapshot()                # takes an onetime snapshot of the device
    PhysicalDeviceSnapshot(
        real=PhysicalDevice(index=2, ...),
        ...
    )

    >>> import os
    >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    >>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0'

    >>> CudaDevice.count()                   # number of NVIDIA GPUs visible to CUDA applications
    4
    >>> Device.cuda.count()                  # use alias in class `Device`
    4

    >>> CudaDevice.all()                     # all CUDA visible devices (or `Device.cuda.all()`)
    [
        CudaDevice(cuda_index=0, nvml_index=3, ...),
        CudaDevice(cuda_index=1, nvml_index=2, ...),
        ...
    ]

    >>> cuda0 = CudaDevice(cuda_index=0)     # use CUDA ordinal (or `Device.cuda(0)`)
    >>> cuda1 = CudaDevice(nvml_index=2)     # use NVML ordinal
    >>> cuda2 = CudaDevice(uuid='GPU-xxxxxx')  # use UUID string

    >>> cuda0.memory_free()                  # total free memory in bytes
    11550654464
    >>> cuda0.memory_free_human()            # total free memory in human readable format
    '11016MiB'

    >>> cuda1.as_snapshot()                  # takes an onetime snapshot of the device
    CudaDeviceSnapshot(
        real=CudaDevice(cuda_index=1, nvml_index=2, ...),
        ...
    )
"""

# pylint: disable=too-many-lines

from __future__ import annotations

import contextlib
import functools
import multiprocessing as mp
import os
import re
import subprocess
import sys
import textwrap
import threading
import time
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple, overload

from nvitop.api import libcuda, libcudart, libnvml
from nvitop.api.process import GpuProcess
from nvitop.api.utils import (
    NA,
    UINT_MAX,
    NaType,
    Snapshot,
    boolify,
    bytes2human,
    memoize_when_activated,
)


if TYPE_CHECKING:
    from collections.abc import Callable, Generator, Hashable, Iterable
    from typing_extensions import (
        Literal,  # Python 3.8+
        Self,  # Python 3.11+
    )


__all__ = [
    'Device',
    'PhysicalDevice',
    'MigDevice',
    'CudaDevice',
    'CudaMigDevice',
    'parse_cuda_visible_devices',
    'normalize_cuda_visible_devices',
]

# Class definitions ################################################################################


class MemoryInfo(NamedTuple):  # in bytes # pylint: disable=missing-class-docstring
    total: int | NaType
    free: int | NaType
    used: int | NaType


class ClockInfos(NamedTuple):  # in MHz # pylint: disable=missing-class-docstring
    graphics: int | NaType
    sm: int | NaType
    memory: int | NaType
    video: int | NaType


class ClockSpeedInfos(NamedTuple):  # pylint: disable=missing-class-docstring
    current: ClockInfos
    max: ClockInfos


class UtilizationRates(NamedTuple):  # in percentage # pylint: disable=missing-class-docstring
    gpu: int | NaType
    memory: int | NaType
    encoder: int | NaType
    decoder: int | NaType


class ThroughputInfo(NamedTuple):  # in KiB/s # pylint: disable=missing-class-docstring
    tx: int | NaType
    rx: int | NaType

    @property
    def transmit(self) -> int | NaType:
        """Alias of :attr:`tx`."""
        return self.tx

    @property
    def receive(self) -> int | NaType:
        """Alias of :attr:`rx`."""
        return self.rx


# Sentinel type distinguishing "argument omitted" from an explicit `None`.
# pylint: disable-next=missing-class-docstring,too-few-public-methods
class ValueOmitted:
    def __repr__(self) -> str:
        # NOTE(review): restored the sentinel repr text (the source copy had the
        # angle-bracketed span stripped, leaving an empty string).
        return '<VALUE OMITTED>'


_VALUE_OMITTED: str = ValueOmitted()  # type: ignore[assignment]
ValueOmitted() # type: ignore[assignment] del ValueOmitted class Device: # pylint: disable=too-many-instance-attributes,too-many-public-methods """Live class of the GPU devices, different from the device snapshots. :meth:`Device.__new__()` returns different types depending on the given arguments. .. code-block:: python - (index: int) -> PhysicalDevice - (index: (int, int)) -> MigDevice - (uuid: str) -> Union[PhysicalDevice, MigDevice] # depending on the UUID value - (bus_id: str) -> PhysicalDevice Examples: >>> Device.driver_version() # version of the installed NVIDIA display driver '470.129.06' >>> Device.count() # number of NVIDIA GPUs in the system 10 >>> Device.all() # all physical devices in the system [ PhysicalDevice(index=0, ...), PhysicalDevice(index=1, ...), ... ] >>> nvidia0 = Device(index=0) # -> PhysicalDevice >>> mig10 = Device(index=(1, 0)) # -> MigDevice >>> nvidia2 = Device(uuid='GPU-xxxxxx') # -> PhysicalDevice >>> mig30 = Device(uuid='MIG-xxxxxx') # -> MigDevice >>> nvidia0.memory_free() # total free memory in bytes 11550654464 >>> nvidia0.memory_free_human() # total free memory in human readable format '11016MiB' >>> nvidia2.as_snapshot() # takes an onetime snapshot of the device PhysicalDeviceSnapshot( real=PhysicalDevice(index=2, ...), ... ) Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. libnvml.NVMLError_InvalidArgument: If the device index is out of range. TypeError: If the number of non-None arguments is not exactly 1. TypeError: If the given index is a tuple but is not consist of two integers. 
""" # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices # GPU UUID : `GPU-` # MIG UUID : `MIG-GPU-//` # MIG UUID (R470+): `MIG-` UUID_PATTERN: re.Pattern = re.compile( r"""^ # full match (?:(?PMIG)-)? # prefix for MIG UUID (?:(?PGPU)-)? # prefix for GPU UUID (?(MigMode)|(?(GpuUuid)|GPU-)) # always have a prefix (?P[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12}) # UUID for the GPU/MIG device in lower case # Suffix for MIG device while using GPU UUID with GPU instance (GI) ID and compute instance (CI) ID (?(MigMode) # match only when the MIG prefix matches (?(GpuUuid) # match only when provide with GPU UUID /(?P\d+) # GI ID of the MIG device /(?P\d+) # CI ID of the MIG device |) |) $""", # full match flags=re.VERBOSE, ) GPU_PROCESS_CLASS: type[GpuProcess] = GpuProcess cuda: type[CudaDevice] = None # type: ignore[assignment] # defined in below """Shortcut for class :class:`CudaDevice`.""" _nvml_index: int | tuple[int, int] @classmethod def is_available(cls) -> bool: """Test whether there are any devices and the NVML library is successfully loaded.""" try: return cls.count() > 0 except libnvml.NVMLError: return False @staticmethod def driver_version() -> str | NaType: """The version of the installed NVIDIA display driver. This is an alphanumeric string. Command line equivalent: .. code:: bash nvidia-smi --id=0 --format=csv,noheader,nounits --query-gpu=driver_version Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. 
""" return libnvml.nvmlQuery('nvmlSystemGetDriverVersion') @staticmethod def cuda_driver_version() -> str | NaType: """The maximum CUDA version supported by the NVIDIA display driver. This is an alphanumeric string. This can be different from the version of the CUDA Runtime. See also :meth:`cuda_runtime_version`. Returns: Union[str, NaType] The maximum CUDA version supported by the NVIDIA display driver. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. """ cuda_driver_version = libnvml.nvmlQuery('nvmlSystemGetCudaDriverVersion') if libnvml.nvmlCheckReturn(cuda_driver_version, int): major = cuda_driver_version // 1000 minor = (cuda_driver_version % 1000) // 10 revision = cuda_driver_version % 10 if revision == 0: return f'{major}.{minor}' return f'{major}.{minor}.{revision}' return NA max_cuda_version = cuda_driver_version @staticmethod def cuda_runtime_version() -> str | NaType: """The CUDA Runtime version. This is an alphanumeric string. This can be different from the CUDA driver version. See also :meth:`cuda_driver_version`. Returns: Union[str, NaType] The CUDA Runtime version, or :const:`nvitop.NA` when no CUDA Runtime is available or no CUDA-capable devices are present. """ try: return libcudart.cudaRuntimeGetVersion() except libcudart.cudaError: return NA cudart_version = cuda_runtime_version @classmethod def count(cls) -> int: """The number of NVIDIA GPUs in the system. Command line equivalent: .. code:: bash nvidia-smi --id=0 --format=csv,noheader,nounits --query-gpu=count Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. 
            libnvml.NVMLError_LibRmVersionMismatch:
                If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
                driver without reloading the kernel module.
        """
        return libnvml.nvmlQuery('nvmlDeviceGetCount', default=0)

    @classmethod
    def all(cls) -> list[PhysicalDevice]:
        """Return a list of all physical devices in the system."""
        return cls.from_indices()  # type: ignore[return-value]

    @classmethod
    def from_indices(
        cls,
        indices: int | Iterable[int | tuple[int, int]] | None = None,
    ) -> list[PhysicalDevice | MigDevice]:
        """Return a list of devices of the given indices.

        Args:
            indices (Iterable[Union[int, Tuple[int, int]]]):
                Indices of the devices. For each index, get :class:`PhysicalDevice` for single int
                and :class:`MigDevice` for tuple (int, int). That is:
                - (int)        -> PhysicalDevice
                - ((int, int)) -> MigDevice

        Returns: List[Union[PhysicalDevice, MigDevice]]
            A list of :class:`PhysicalDevice` and/or :class:`MigDevice` instances of the given indices.

        Raises:
            libnvml.NVMLError_LibraryNotFound:
                If cannot find the NVML library, usually the NVIDIA driver is not installed.
            libnvml.NVMLError_DriverNotLoaded:
                If NVIDIA driver is not loaded.
            libnvml.NVMLError_LibRmVersionMismatch:
                If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
                driver without reloading the kernel module.
            libnvml.NVMLError_NotFound:
                If the device is not found for the given NVML identifier.
            libnvml.NVMLError_InvalidArgument:
                If the device index is out of range.
        """
        if indices is None:
            try:
                indices = range(cls.count())
            except libnvml.NVMLError:
                return []

        if isinstance(indices, int):
            indices = [indices]

        # `Device(...)` dispatches to PhysicalDevice / MigDevice in `__new__`.
        return list(map(cls, indices))  # type: ignore[arg-type]

    @staticmethod
    def from_cuda_visible_devices() -> list[CudaDevice]:
        """Return a list of all CUDA visible devices.

        The CUDA ordinal will be enumerate from the ``CUDA_VISIBLE_DEVICES`` environment variable.

        Note:
            The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid.

        See also for CUDA Device Enumeration:
            - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
            - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

        Returns: List[CudaDevice]
            A list of :class:`CudaDevice` instances.
        """  # pylint: disable=line-too-long
        visible_device_indices = Device.parse_cuda_visible_devices()

        device_index: int | tuple[int, int]
        cuda_devices: list[CudaDevice] = []
        for cuda_index, device_index in enumerate(visible_device_indices):  # type: ignore[assignment]
            cuda_devices.append(CudaDevice(cuda_index, nvml_index=device_index))

        return cuda_devices

    @staticmethod
    def from_cuda_indices(cuda_indices: int | Iterable[int] | None = None) -> list[CudaDevice]:
        """Return a list of CUDA devices of the given CUDA indices.

        The CUDA ordinal will be enumerate from the ``CUDA_VISIBLE_DEVICES`` environment variable.

        See also for CUDA Device Enumeration:
            - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
            - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

        Args:
            cuda_indices (Iterable[int]):
                The indices of the GPU in CUDA ordinal, if not given, returns all visible CUDA devices.

        Returns: List[CudaDevice]
            A list of :class:`CudaDevice` of the given CUDA indices.

        Raises:
            libnvml.NVMLError_LibraryNotFound:
                If cannot find the NVML library, usually the NVIDIA driver is not installed.
            libnvml.NVMLError_DriverNotLoaded:
                If NVIDIA driver is not loaded.
            libnvml.NVMLError_LibRmVersionMismatch:
                If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
                driver without reloading the kernel module.
            RuntimeError:
                If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable.
        """  # pylint: disable=line-too-long
        cuda_devices = Device.from_cuda_visible_devices()
        if cuda_indices is None:
            return cuda_devices

        if isinstance(cuda_indices, int):
            cuda_indices = [cuda_indices]

        cuda_indices = list(cuda_indices)
        cuda_device_count = len(cuda_devices)

        devices = []
        for cuda_index in cuda_indices:
            if not 0 <= cuda_index < cuda_device_count:
                raise RuntimeError(f'CUDA Error: invalid device ordinal: {cuda_index!r}.')
            device = cuda_devices[cuda_index]
            devices.append(device)

        return devices

    @staticmethod
    def parse_cuda_visible_devices(
        cuda_visible_devices: str | None = _VALUE_OMITTED,
    ) -> list[int] | list[tuple[int, int]]:
        """Parse the given ``CUDA_VISIBLE_DEVICES`` value into a list of NVML device indices.

        This is an alias of :func:`parse_cuda_visible_devices`.

        Note:
            The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid.

        See also for CUDA Device Enumeration:
            - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
            - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

        Args:
            cuda_visible_devices (Optional[str]):
                The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the
                environment will be used. If explicitly given by :data:`None`, the
                ``CUDA_VISIBLE_DEVICES`` environment variable will be unset before parsing.

        Returns: Union[List[int], List[Tuple[int, int]]]
            A list of int (physical device) or a list of tuple of two integers (MIG device) for the
            corresponding real device indices.
        """  # pylint: disable=line-too-long
        return parse_cuda_visible_devices(cuda_visible_devices)

    @staticmethod
    def normalize_cuda_visible_devices(cuda_visible_devices: str | None = _VALUE_OMITTED) -> str:
        """Parse the given ``CUDA_VISIBLE_DEVICES`` value and convert it into a comma-separated string of UUIDs.

        This is an alias of :func:`normalize_cuda_visible_devices`.

        Note:
            The result could be empty string if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid.

        See also for CUDA Device Enumeration:
            - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
            - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

        Args:
            cuda_visible_devices (Optional[str]):
                The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the
                environment will be used. If explicitly given by :data:`None`, the
                ``CUDA_VISIBLE_DEVICES`` environment variable will be unset before parsing.

        Returns: str
            The comma-separated string (GPU UUIDs) of the ``CUDA_VISIBLE_DEVICES`` environment variable.
        """  # pylint: disable=line-too-long
        return normalize_cuda_visible_devices(cuda_visible_devices)

    def __new__(
        cls,
        index: int | tuple[int, int] | str | None = None,
        *,
        uuid: str | None = None,
        bus_id: str | None = None,
    ) -> Self:
        """Create a new instance of Device.

        The type of the result is determined by the given argument.

        .. code-block:: python

            - (index: int)        -> PhysicalDevice
            - (index: (int, int)) -> MigDevice
            - (uuid: str)         -> Union[PhysicalDevice, MigDevice]  # depending on the UUID value
            - (bus_id: str)       -> PhysicalDevice

        Note: This method takes exact 1 non-None argument.

        Returns: Union[PhysicalDevice, MigDevice]
            A :class:`PhysicalDevice` instance or a :class:`MigDevice` instance.

        Raises:
            TypeError:
                If the number of non-None arguments is not exactly 1.
            TypeError:
                If the given index is a tuple but does not consist of two integers.
""" if (index, uuid, bus_id).count(None) != 2: raise TypeError( f'Device(index=None, uuid=None, bus_id=None) takes 1 non-None arguments ' f'but (index, uuid, bus_id) = {(index, uuid, bus_id)!r} were given', ) if cls is not Device: # Use the subclass type if the type is explicitly specified return super().__new__(cls) # Auto subclass type inference logic goes here when `cls` is `Device` (e.g., calls `Device(...)`) match: re.Match | None = None if isinstance(index, str): match = cls.UUID_PATTERN.match(index) if match is not None: # passed by UUID index, uuid = None, index elif isinstance(uuid, str): match = cls.UUID_PATTERN.match(uuid) if index is not None: if not isinstance(index, int): if not isinstance(index, tuple): raise TypeError( f'index must be an integer, or a tuple of two integers, or a valid UUID string, ' f'but index = {index!r} was given', ) if not ( len(index) == 2 and isinstance(index[0], int) and isinstance(index[1], int) ): raise TypeError( f'index for MIG device must be a tuple of two integers ' f'but index = {index!r} was given', ) return super().__new__(MigDevice) # type: ignore[return-value] elif uuid is not None and match is not None and match.group('MigMode') is not None: return super().__new__(MigDevice) # type: ignore[return-value] return super().__new__(PhysicalDevice) # type: ignore[return-value] def __init__( self, index: int | str | None = None, *, uuid: str | None = None, bus_id: str | None = None, ) -> None: """Initialize the instance created by :meth:`__new__()`. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. 
libnvml.NVMLError_InvalidArgument: If the device index is out of range. """ if isinstance(index, str) and self.UUID_PATTERN.match(index) is not None: # passed by UUID index, uuid = None, index index, uuid, bus_id = ( arg.encode() if isinstance(arg, str) else arg for arg in (index, uuid, bus_id) ) self._name: str = NA self._uuid: str = NA self._bus_id: str = NA self._memory_total: int | NaType = NA self._memory_total_human: str = NA self._nvlink_link_count: int | None = None self._nvlink_throughput_counters: tuple[tuple[int | NaType, int]] | None = None self._is_mig_device: bool | None = None self._cuda_index: int | None = None self._cuda_compute_capability: tuple[int, int] | NaType | None = None self._handle: libnvml.c_nvmlDevice_t | None if index is not None: self._nvml_index = index # type: ignore[assignment] try: self._handle = libnvml.nvmlQuery( 'nvmlDeviceGetHandleByIndex', index, ignore_errors=False, ) except libnvml.NVMLError_GpuIsLost: self._handle = None self._name = 'ERROR: GPU is Lost' except libnvml.NVMLError_Unknown: self._handle = None self._name = 'ERROR: Unknown' else: try: if uuid is not None: self._handle = libnvml.nvmlQuery( 'nvmlDeviceGetHandleByUUID', uuid, ignore_errors=False, ) else: self._handle = libnvml.nvmlQuery( 'nvmlDeviceGetHandleByPciBusId', bus_id, ignore_errors=False, ) except libnvml.NVMLError_GpuIsLost: self._handle = None self._nvml_index = NA # type: ignore[assignment] self._name = 'ERROR: GPU is Lost' except libnvml.NVMLError_Unknown: self._handle = None self._nvml_index = NA # type: ignore[assignment] self._name = 'ERROR: Unknown' else: self._nvml_index = libnvml.nvmlQuery('nvmlDeviceGetIndex', self._handle) self._max_clock_infos: ClockInfos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA) self._lock: threading.RLock = threading.RLock() self._ident: tuple[Hashable, str] = (self.index, self.uuid()) self._hash: int | None = None def __repr__(self) -> str: """Return a string representation of the device.""" return 
'{}(index={}, name={!r}, total_memory={})'.format( # noqa: UP032 self.__class__.__name__, self.index, self.name(), self.memory_total_human(), ) def __eq__(self, other: object) -> bool: """Test equality to other object.""" if not isinstance(other, Device): return NotImplemented return self._ident == other._ident def __hash__(self) -> int: """Return a hash value of the device.""" if self._hash is None: self._hash = hash(self._ident) return self._hash def __getattr__(self, name: str) -> Any | Callable[..., Any]: """Get the object attribute. If the attribute is not defined, make a method from ``pynvml.nvmlDeviceGet(handle)``. The attribute name will be converted to PascalCase string. Raises: AttributeError: If the attribute is not defined in ``pynvml.py``. Examples: >>> device = Device(0) >>> # Method `cuda_compute_capability` is not implemented in the class definition >>> PhysicalDevice.cuda_compute_capability AttributeError: type object 'Device' has no attribute 'cuda_compute_capability' >>> # Dynamically create a new method from `pynvml.nvmlDeviceGetCudaComputeCapability(device.handle, *args, **kwargs)` >>> device.cuda_compute_capability >>> device.cuda_compute_capability() (8, 6) """ # pylint: disable=line-too-long try: return super().__getattr__(name) # type: ignore[misc] except AttributeError: if name == '_cache': raise if self._handle is None: return lambda: NA match = libnvml.VERSIONED_PATTERN.match(name) if match is not None: name = match.group('name') suffix = match.group('suffix') else: suffix = '' try: pascal_case = name.title().replace('_', '') func = getattr(libnvml, 'nvmlDeviceGet' + pascal_case + suffix) except AttributeError: pascal_case = ''.join( part[:1].upper() + part[1:] for part in filter(None, name.split('_')) ) func = getattr(libnvml, 'nvmlDeviceGet' + pascal_case + suffix) def attribute(*args: Any, **kwargs: Any) -> Any: try: return libnvml.nvmlQuery( func, self._handle, *args, **kwargs, ignore_errors=False, ) except 
libnvml.NVMLError_NotSupported: return NA attribute.__name__ = name attribute.__qualname__ = f'{self.__class__.__name__}.{name}' setattr(self, name, attribute) return attribute def __reduce__(self) -> tuple[type[Device], tuple[int | tuple[int, int]]]: """Return state information for pickling.""" return self.__class__, (self._nvml_index,) @property def index(self) -> int | tuple[int, int]: """The NVML index of the device. Returns: Union[int, Tuple[int, int]] Returns an int for physical device and tuple of two integers for MIG device. """ return self._nvml_index @property def nvml_index(self) -> int | tuple[int, int]: """The NVML index of the device. Returns: Union[int, Tuple[int, int]] Returns an int for physical device and tuple of two integers for MIG device. """ return self._nvml_index @property def physical_index(self) -> int: """The index of the physical device. Returns: int An int for the physical device index. For MIG devices, returns the index of the parent physical device. """ return self._nvml_index # type: ignore[return-value] # will be overridden in MigDevice @property def handle(self) -> libnvml.c_nvmlDevice_t | None: """The NVML device handle.""" return self._handle @property def cuda_index(self) -> int: """The CUDA device index. The value will be evaluated on the first call. Raises: RuntimeError: If the current device is not visible to CUDA applications (i.e. not listed in the ``CUDA_VISIBLE_DEVICES`` environment variable or the environment variable is invalid). """ if self._cuda_index is None: visible_device_indices = self.parse_cuda_visible_devices() try: self._cuda_index = visible_device_indices.index(self.index) # type: ignore[arg-type] except ValueError as ex: raise RuntimeError( f'CUDA Error: Device(index={self.index}) is not visible to CUDA applications', ) from ex return self._cuda_index def name(self) -> str | NaType: """The official product name of the GPU. This is an alphanumeric string. For all products. 
        Returns:
            Union[str, NaType]
                The official product name, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=name
        """
        if self._handle is not None and self._name is NA:
            self._name = libnvml.nvmlQuery('nvmlDeviceGetName', self._handle)
        return self._name

    def uuid(self) -> str | NaType:
        """This value is the globally unique immutable alphanumeric identifier of the GPU.

        It does not correspond to any physical label on the board.

        Returns:
            Union[str, NaType]
                The UUID of the device, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=uuid
        """
        if self._handle is not None and self._uuid is NA:
            self._uuid = libnvml.nvmlQuery('nvmlDeviceGetUUID', self._handle)
        return self._uuid

    def bus_id(self) -> str | NaType:
        """PCI bus ID as "domain:bus:device.function", in hex.

        Returns:
            Union[str, NaType]
                The PCI bus ID of the device, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=pci.bus_id
        """
        if self._handle is not None and self._bus_id is NA:
            self._bus_id = libnvml.nvmlQuery(
                lambda handle: libnvml.nvmlDeviceGetPciInfo(handle).busId,
                self._handle,
            )
        return self._bus_id

    def serial(self) -> str | NaType:
        """This number matches the serial number physically printed on each board.

        It is a globally unique immutable alphanumeric value.

        Returns:
            Union[str, NaType]
                The serial number of the device, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=serial
        """
        if self._handle is not None:
            return libnvml.nvmlQuery('nvmlDeviceGetSerial', self._handle)
        return NA

    @memoize_when_activated
    def memory_info(self) -> MemoryInfo:  # in bytes
        """Return a named tuple with memory information (in bytes) for the device.

        Returns: MemoryInfo(total, free, used)
            A named tuple with memory information, the item could be :const:`nvitop.NA` when not
            applicable.
        """
        if self._handle is not None:
            memory_info = libnvml.nvmlQuery('nvmlDeviceGetMemoryInfo', self._handle)
            if libnvml.nvmlCheckReturn(memory_info):
                return MemoryInfo(
                    total=memory_info.total,
                    free=memory_info.free,
                    used=memory_info.used,
                )
        return MemoryInfo(total=NA, free=NA, used=NA)

    def memory_total(self) -> int | NaType:  # in bytes
        """Total installed GPU memory in bytes.

        Returns:
            Union[int, NaType]
                Total installed GPU memory in bytes, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=memory.total
        """
        # Total capacity never changes, so cache it after the first successful query.
        if self._memory_total is NA:
            self._memory_total = self.memory_info().total
        return self._memory_total

    def memory_used(self) -> int | NaType:  # in bytes
        """Total memory allocated by active contexts in bytes.

        Returns:
            Union[int, NaType]
                Total memory allocated by active contexts in bytes, or :const:`nvitop.NA` when not
                applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=memory.used
        """
        return self.memory_info().used

    def memory_free(self) -> int | NaType:  # in bytes
        """Total free memory in bytes.

        Returns:
            Union[int, NaType]
                Total free memory in bytes, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=memory.free
        """
        return self.memory_info().free

    def memory_total_human(self) -> str | NaType:  # in human readable
        """Total installed GPU memory in human readable format.

        Returns:
            Union[str, NaType]
                Total installed GPU memory in human readable format, or :const:`nvitop.NA` when not
                applicable.
        """
        if self._memory_total_human is NA:
            self._memory_total_human = bytes2human(self.memory_total())
        return self._memory_total_human

    def memory_used_human(self) -> str | NaType:  # in human readable
        """Total memory allocated by active contexts in human readable format.

        Returns:
            Union[str, NaType]
                Total memory allocated by active contexts in human readable format, or :const:`nvitop.NA` when not applicable.
        """  # pylint: disable=line-too-long
        return bytes2human(self.memory_used())

    def memory_free_human(self) -> str | NaType:  # in human readable
        """Total free memory in human readable format.

        Returns:
            Union[str, NaType]
                Total free memory in human readable format, or :const:`nvitop.NA` when not applicable.
        """
        return bytes2human(self.memory_free())

    def memory_percent(self) -> float | NaType:  # in percentage
        """The percentage of used memory over total memory (``0 <= p <= 100``).

        Returns:
            Union[float, NaType]
                The percentage of used memory over total memory, or :const:`nvitop.NA` when not
                applicable.
        """
        total, _, used = self.memory_info()
        if libnvml.nvmlCheckReturn(used, int) and libnvml.nvmlCheckReturn(total, int):
            return round(100.0 * used / total, 1)
        return NA

    def memory_usage(self) -> str:  # string of used memory over total memory (in human readable)
        """The used memory over total memory in human readable format.

        Returns:
            str
                The used memory over total memory in human readable format, or :const:`'N/A / N/A'` when not applicable.
        """  # pylint: disable=line-too-long
        return f'{self.memory_used_human()} / {self.memory_total_human()}'

    @memoize_when_activated
    def bar1_memory_info(self) -> MemoryInfo:  # in bytes
        """Return a named tuple with BAR1 memory information (in bytes) for the device.

        Returns: MemoryInfo(total, free, used)
            A named tuple with BAR1 memory information, the item could be :const:`nvitop.NA` when not applicable.
        """  # pylint: disable=line-too-long
        if self._handle is not None:
            memory_info = libnvml.nvmlQuery('nvmlDeviceGetBAR1MemoryInfo', self._handle)
            if libnvml.nvmlCheckReturn(memory_info):
                return MemoryInfo(
                    total=memory_info.bar1Total,
                    free=memory_info.bar1Free,
                    used=memory_info.bar1Used,
                )
        return MemoryInfo(total=NA, free=NA, used=NA)

    def bar1_memory_total(self) -> int | NaType:  # in bytes
        """Total BAR1 memory in bytes.

        Returns:
            Union[int, NaType]
                Total BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable.
        """
        return self.bar1_memory_info().total

    def bar1_memory_used(self) -> int | NaType:  # in bytes
        """Total used BAR1 memory in bytes.

        Returns:
            Union[int, NaType]
                Total used BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable.
        """
        return self.bar1_memory_info().used

    def bar1_memory_free(self) -> int | NaType:  # in bytes
        """Total free BAR1 memory in bytes.

        Returns:
            Union[int, NaType]
                Total free BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable.
        """
        return self.bar1_memory_info().free

    def bar1_memory_total_human(self) -> str | NaType:  # in human readable
        """Total BAR1 memory in human readable format.

        Returns:
            Union[str, NaType]
                Total BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable.
        """
        return bytes2human(self.bar1_memory_total())

    def bar1_memory_used_human(self) -> str | NaType:  # in human readable
        """Total used BAR1 memory in human readable format.

        Returns:
            Union[str, NaType]
                Total used BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable.
        """
        return bytes2human(self.bar1_memory_used())

    def bar1_memory_free_human(self) -> str | NaType:  # in human readable
        """Total free BAR1 memory in human readable format.

        Returns:
            Union[str, NaType]
                Total free BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable.
        """
        return bytes2human(self.bar1_memory_free())

    def bar1_memory_percent(self) -> float | NaType:  # in percentage
        """The percentage of used BAR1 memory over total BAR1 memory (0 <= p <= 100).

        Returns:
            Union[float, NaType]
                The percentage of used BAR1 memory over total BAR1 memory, or :const:`nvitop.NA` when not applicable.
        """  # pylint: disable=line-too-long
        total, _, used = self.bar1_memory_info()
        if libnvml.nvmlCheckReturn(used, int) and libnvml.nvmlCheckReturn(total, int):
            return round(100.0 * used / total, 1)
        return NA

    def bar1_memory_usage(self) -> str:  # in human readable
        """The used BAR1 memory over total BAR1 memory in human readable format.

        Returns:
            str
                The used BAR1 memory over total BAR1 memory in human readable format, or :const:`'N/A / N/A'` when not applicable.
        """  # pylint: disable=line-too-long
        return f'{self.bar1_memory_used_human()} / {self.bar1_memory_total_human()}'

    @memoize_when_activated
    def utilization_rates(self) -> UtilizationRates:  # in percentage
        """Return a named tuple with GPU utilization rates (in percentage) for the device.

        Returns: UtilizationRates(gpu, memory, encoder, decoder)
            A named tuple with GPU utilization rates (in percentage) for the device, the item could be :const:`nvitop.NA` when not applicable.
        """  # pylint: disable=line-too-long
        gpu, memory, encoder, decoder = NA, NA, NA, NA
        if self._handle is not None:
            utilization_rates = libnvml.nvmlQuery('nvmlDeviceGetUtilizationRates', self._handle)
            if libnvml.nvmlCheckReturn(utilization_rates):
                gpu, memory = utilization_rates.gpu, utilization_rates.memory
            # Encoder/decoder utilization come from separate NVML calls that return a
            # (utilization, sampling-period) list; only the first item is the rate.
            encoder_utilization = libnvml.nvmlQuery('nvmlDeviceGetEncoderUtilization', self._handle)
            if libnvml.nvmlCheckReturn(encoder_utilization, list) and len(encoder_utilization) > 0:
                encoder = encoder_utilization[0]
            decoder_utilization = libnvml.nvmlQuery('nvmlDeviceGetDecoderUtilization', self._handle)
            if libnvml.nvmlCheckReturn(decoder_utilization, list) and len(decoder_utilization) > 0:
                decoder = decoder_utilization[0]
        return UtilizationRates(gpu=gpu, memory=memory, encoder=encoder, decoder=decoder)

    def gpu_utilization(self) -> int | NaType:  # in percentage
        """Percent of time over the past sample period during which one or more kernels was executing on the GPU.

        The sample period may be between 1 second and 1/6 second depending on the product.

        Returns:
            Union[int, NaType]
                The GPU utilization rate in percentage, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=utilization.gpu
        """  # pylint: disable=line-too-long
        return self.utilization_rates().gpu

    gpu_percent = gpu_utilization  # in percentage

    def memory_utilization(self) -> int | NaType:  # in percentage
        """Percent of time over the past sample period during which global (device) memory was being read or written.

        The sample period may be between 1 second and 1/6 second depending on the product.

        Returns:
            Union[int, NaType]
                The memory bandwidth utilization rate of the GPU in percentage, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=utilization.memory
        """  # pylint: disable=line-too-long
        return self.utilization_rates().memory

    def encoder_utilization(self) -> int | NaType:  # in percentage
        """The encoder utilization rate in percentage.

        Returns:
            Union[int, NaType]
                The encoder utilization rate in percentage, or :const:`nvitop.NA` when not applicable.
        """
        return self.utilization_rates().encoder

    def decoder_utilization(self) -> int | NaType:  # in percentage
        """The decoder utilization rate in percentage.

        Returns:
            Union[int, NaType]
                The decoder utilization rate in percentage, or :const:`nvitop.NA` when not applicable.
        """
        return self.utilization_rates().decoder

    @memoize_when_activated
    def clock_infos(self) -> ClockInfos:  # in MHz
        """Return a named tuple with current clock speeds (in MHz) for the device.

        Returns: ClockInfos(graphics, sm, memory, video)
            A named tuple with current clock speeds (in MHz) for the device, the item could be :const:`nvitop.NA` when not applicable.
        """  # pylint: disable=line-too-long
        graphics, sm, memory, video = NA, NA, NA, NA
        if self._handle is not None:
            graphics = libnvml.nvmlQuery(
                'nvmlDeviceGetClockInfo',
                self._handle,
                libnvml.NVML_CLOCK_GRAPHICS,
            )
            sm = libnvml.nvmlQuery(
                'nvmlDeviceGetClockInfo',
                self._handle,
                libnvml.NVML_CLOCK_SM,
            )
            memory = libnvml.nvmlQuery(
                'nvmlDeviceGetClockInfo',
                self._handle,
                libnvml.NVML_CLOCK_MEM,
            )
            video = libnvml.nvmlQuery(
                'nvmlDeviceGetClockInfo',
                self._handle,
                libnvml.NVML_CLOCK_VIDEO,
            )
        return ClockInfos(graphics=graphics, sm=sm, memory=memory, video=video)

    clocks = clock_infos

    @memoize_when_activated
    def max_clock_infos(self) -> ClockInfos:  # in MHz
        """Return a named tuple with maximum clock speeds (in MHz) for the device.

        Returns: ClockInfos(graphics, sm, memory, video)
            A named tuple with maximum clock speeds (in MHz) for the device, the item could be :const:`nvitop.NA` when not applicable.
        """  # pylint: disable=line-too-long
        if self._handle is not None:
            graphics = libnvml.nvmlQuery(
                'nvmlDeviceGetMaxClockInfo',
                self._handle,
                libnvml.NVML_CLOCK_GRAPHICS,
            )
            sm = libnvml.nvmlQuery(
                'nvmlDeviceGetMaxClockInfo',
                self._handle,
                libnvml.NVML_CLOCK_SM,
            )
            memory = libnvml.nvmlQuery(
                'nvmlDeviceGetMaxClockInfo',
                self._handle,
                libnvml.NVML_CLOCK_MEM,
            )
            video = libnvml.nvmlQuery(
                'nvmlDeviceGetMaxClockInfo',
                self._handle,
                libnvml.NVML_CLOCK_VIDEO,
            )
            # Cache on the instance; max clocks are fixed for the lifetime of the device.
            self._max_clock_infos = ClockInfos(graphics=graphics, sm=sm, memory=memory, video=video)
        return self._max_clock_infos

    max_clocks = max_clock_infos

    def clock_speed_infos(self) -> ClockSpeedInfos:  # in MHz
        """Return a named tuple with the current and the maximum clock speeds (in MHz) for the device.

        Returns: ClockSpeedInfos(current, max)
            A named tuple with the current and the maximum clock speeds (in MHz) for the device.
        """
        return ClockSpeedInfos(current=self.clock_infos(), max=self.max_clock_infos())

    def graphics_clock(self) -> int | NaType:  # in MHz
        """Current frequency of graphics (shader) clock in MHz.

        Returns:
            Union[int, NaType]
                The current frequency of graphics (shader) clock in MHz, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=clocks.current.graphics
        """  # pylint: disable=line-too-long
        return self.clock_infos().graphics

    def sm_clock(self) -> int | NaType:  # in MHz
        """Current frequency of SM (Streaming Multiprocessor) clock in MHz.

        Returns:
            Union[int, NaType]
                The current frequency of SM (Streaming Multiprocessor) clock in MHz, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=clocks.current.sm
        """  # pylint: disable=line-too-long
        return self.clock_infos().sm

    def memory_clock(self) -> int | NaType:  # in MHz
        """Current frequency of memory clock in MHz.

        Returns:
            Union[int, NaType]
                The current frequency of memory clock in MHz, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=clocks.current.memory
        """
        return self.clock_infos().memory

    def video_clock(self) -> int | NaType:  # in MHz
        """Current frequency of video encoder/decoder clock in MHz.

        Returns:
            Union[int, NaType]
                The current frequency of video encoder/decoder clock in MHz, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=clocks.current.video
        """  # pylint: disable=line-too-long
        return self.clock_infos().video

    def max_graphics_clock(self) -> int | NaType:  # in MHz
        """Maximum frequency of graphics (shader) clock in MHz.

        Returns:
            Union[int, NaType]
                The maximum frequency of graphics (shader) clock in MHz, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=clocks.max.graphics
        """  # pylint: disable=line-too-long
        return self.max_clock_infos().graphics

    def max_sm_clock(self) -> int | NaType:  # in MHz
        """Maximum frequency of SM (Streaming Multiprocessor) clock in MHz.

        Returns:
            Union[int, NaType]
                The maximum frequency of SM (Streaming Multiprocessor) clock in MHz, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=clocks.max.sm
        """  # pylint: disable=line-too-long
        return self.max_clock_infos().sm

    def max_memory_clock(self) -> int | NaType:  # in MHz
        """Maximum frequency of memory clock in MHz.

        Returns:
            Union[int, NaType]
                The maximum frequency of memory clock in MHz, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=clocks.max.memory
        """
        return self.max_clock_infos().memory

    def max_video_clock(self) -> int | NaType:  # in MHz
        """Maximum frequency of video encoder/decoder clock in MHz.

        Returns:
            Union[int, NaType]
                The maximum frequency of video encoder/decoder clock in MHz, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=clocks.max.video
        """  # pylint: disable=line-too-long
        return self.max_clock_infos().video

    def fan_speed(self) -> int | NaType:  # in percentage
        """The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at.

        This value may exceed 100% in certain cases.

        Note: The reported speed is the intended fan speed.
        If the fan is physically blocked and unable to spin, this output will not match the actual
        fan speed.
        Many parts do not report fan speeds because they rely on cooling via fans in the surrounding
        enclosure.

        Returns:
            Union[int, NaType]
                The fan speed value in percentage, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=fan.speed
        """  # pylint: disable=line-too-long
        if self._handle is not None:
            return libnvml.nvmlQuery('nvmlDeviceGetFanSpeed', self._handle)
        return NA

    def temperature(self) -> int | NaType:  # in Celsius
        """Core GPU temperature in degrees C.

        Returns:
            Union[int, NaType]
                The core GPU temperature in Celsius degrees, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=temperature.gpu
        """
        if self._handle is not None:
            return libnvml.nvmlQuery(
                'nvmlDeviceGetTemperature',
                self._handle,
                libnvml.NVML_TEMPERATURE_GPU,
            )
        return NA

    @memoize_when_activated
    def power_usage(self) -> int | NaType:  # in milliwatts (mW)
        """The last measured power draw for the entire board in milliwatts.

        Returns:
            Union[int, NaType]
                The power draw for the entire board in milliwatts, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            $(( "$(nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=power.draw)" * 1000 ))
        """
        if self._handle is not None:
            return libnvml.nvmlQuery('nvmlDeviceGetPowerUsage', self._handle)
        return NA

    power_draw = power_usage  # in milliwatts (mW)

    @memoize_when_activated
    def power_limit(self) -> int | NaType:  # in milliwatts (mW)
        """The software power limit in milliwatts.

        Set by software like nvidia-smi.

        Returns:
            Union[int, NaType]
                The software power limit in milliwatts, or :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            $(( "$(nvidia-smi --id=<INDEX> --format=csv,noheader,nounits --query-gpu=power.limit)" * 1000 ))
        """
        if self._handle is not None:
            return libnvml.nvmlQuery('nvmlDeviceGetPowerManagementLimit', self._handle)
        return NA

    def power_status(self) -> str:  # string of power usage over power limit in watts (W)
        """The string of power usage over power limit in watts.

        Returns:
            str
                The string of power usage over power limit in watts, or :const:`'N/A / N/A'` when not applicable.
""" # pylint: disable=line-too-long power_usage = self.power_usage() power_limit = self.power_limit() if libnvml.nvmlCheckReturn(power_usage, int): power_usage = f'{round(power_usage / 1000)}W' # type: ignore[assignment] if libnvml.nvmlCheckReturn(power_limit, int): power_limit = f'{round(power_limit / 1000)}W' # type: ignore[assignment] return f'{power_usage} / {power_limit}' def pcie_throughput(self) -> ThroughputInfo: # in KiB/s """The current PCIe throughput in KiB/s. This function is querying a byte counter over a 20ms interval and thus is the PCIe throughput over that interval. Returns: ThroughputInfo(tx, rx) A named tuple with current PCIe throughput in KiB/s, the item could be :const:`nvitop.NA` when not applicable. """ return ThroughputInfo(tx=self.pcie_tx_throughput(), rx=self.pcie_rx_throughput()) @memoize_when_activated def pcie_tx_throughput(self) -> int | NaType: # in KiB/s """The current PCIe transmit throughput in KiB/s. This function is querying a byte counter over a 20ms interval and thus is the PCIe throughput over that interval. Returns: Union[int, NaType] The current PCIe transmit throughput in KiB/s, or :const:`nvitop.NA` when not applicable. """ if self._handle is not None: return libnvml.nvmlQuery( 'nvmlDeviceGetPcieThroughput', self._handle, libnvml.NVML_PCIE_UTIL_RX_BYTES, ) return NA @memoize_when_activated def pcie_rx_throughput(self) -> int | NaType: # in KiB/s """The current PCIe receive throughput in KiB/s. This function is querying a byte counter over a 20ms interval and thus is the PCIe throughput over that interval. Returns: Union[int, NaType] The current PCIe receive throughput in KiB/s, or :const:`nvitop.NA` when not applicable. """ if self._handle is not None: return libnvml.nvmlQuery( 'nvmlDeviceGetPcieThroughput', self._handle, libnvml.NVML_PCIE_UTIL_RX_BYTES, ) return NA def pcie_tx_throughput_human(self) -> str | NaType: # in human readable """The current PCIe transmit throughput in human readable format. 
This function is querying a byte counter over a 20ms interval and thus is the PCIe throughput over that interval. Returns: Union[str, NaType] The current PCIe transmit throughput in human readable format, or :const:`nvitop.NA` when not applicable. """ tx = self.pcie_tx_throughput() if libnvml.nvmlCheckReturn(tx, int): return f'{bytes2human(tx * 1024)}/s' return NA def pcie_rx_throughput_human(self) -> str | NaType: # in human readable """The current PCIe receive throughput in human readable format. This function is querying a byte counter over a 20ms interval and thus is the PCIe throughput over that interval. Returns: Union[str, NaType] The current PCIe receive throughput in human readable format, or :const:`nvitop.NA` when not applicable. """ rx = self.pcie_rx_throughput() if libnvml.nvmlCheckReturn(rx, int): return f'{bytes2human(rx * 1024)}/s' return NA def nvlink_link_count(self) -> int: """The number of NVLinks that the GPU has. Returns: Union[int, NaType] The number of NVLinks that the GPU has. """ if self._handle is not None and self._nvlink_link_count is None: ((nvlink_link_count, _),) = libnvml.nvmlQueryFieldValues( self._handle, [libnvml.NVML_FI_DEV_NVLINK_LINK_COUNT], ) if libnvml.nvmlCheckReturn(nvlink_link_count, int): self._nvlink_link_count = nvlink_link_count # type: ignore[assignment] if self._nvlink_link_count is None: self._nvlink_link_count = 0 return self._nvlink_link_count @memoize_when_activated def nvlink_throughput(self, interval: float | None = None) -> list[ThroughputInfo]: # in KiB/s """The current NVLink throughput for each NVLink in KiB/s. This function is querying data counters between methods calls and thus is the NVLink throughput over that interval. For the first call, the function is blocking for 20ms to get the first data counters. Args: interval (Optional[float]): The interval in seconds between two calls to get the NVLink throughput. 
If ``interval`` is a positive number, compares throughput counters before and after the interval (blocking). If ``interval`` is :const`0.0` or :data:`None`, compares throughput counters since the last call, returning immediately (non-blocking). Returns: List[ThroughputInfo(tx, rx)] A list of named tuples with current NVLink throughput for each NVLink in KiB/s, the item could be :const:`nvitop.NA` when not applicable. """ if self._handle is None: return [] nvlink_link_count = self.nvlink_link_count() if nvlink_link_count == 0: return [] def query_nvlink_throughput_counters() -> tuple[tuple[int | NaType, int]]: return tuple( # type: ignore[return-value] libnvml.nvmlQueryFieldValues( self._handle, [ # type: ignore[arg-type] (libnvml.NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX, i) for i in range(nvlink_link_count) ] + [ (libnvml.NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX, i) for i in range(nvlink_link_count) ], ), ) if interval is not None: if not interval >= 0.0: raise ValueError(f'`interval` must be a non-negative number, got {interval!r}.') if interval > 0.0: self._nvlink_throughput_counters = query_nvlink_throughput_counters() time.sleep(interval) if self._nvlink_throughput_counters is None: self._nvlink_throughput_counters = query_nvlink_throughput_counters() time.sleep(0.02) # 20ms old_throughput_counters = self._nvlink_throughput_counters new_throughput_counters = query_nvlink_throughput_counters() throughputs: list[int | NaType] = [] for (old_counter, old_timestamp), (new_counter, new_timestamp) in zip( old_throughput_counters, new_throughput_counters, ): if ( libnvml.nvmlCheckReturn(old_counter, int) and libnvml.nvmlCheckReturn(new_counter, int) and new_timestamp > old_timestamp ): throughputs.append( round( 1_000_000 * (new_counter - old_counter) / (new_timestamp - old_timestamp), ), ) else: throughputs.append(NA) self._nvlink_throughput_counters = new_throughput_counters return [ ThroughputInfo(tx=tx, rx=rx) for tx, rx in zip(throughputs[:nvlink_link_count], 
throughputs[nvlink_link_count:]) ] def nvlink_total_throughput(self, interval: float | None = None) -> ThroughputInfo: # in KiB/s """The total NVLink throughput for all NVLinks in KiB/s. This function is querying data counters between methods calls and thus is the NVLink throughput over that interval. For the first call, the function is blocking for 20ms to get the first data counters. Args: interval (Optional[float]): The interval in seconds between two calls to get the NVLink throughput. If ``interval`` is a positive number, compares throughput counters before and after the interval (blocking). If ``interval`` is :const`0.0` or :data:`None`, compares throughput counters since the last call, returning immediately (non-blocking). Returns: ThroughputInfo(tx, rx) A named tuple with the total NVLink throughput for all NVLinks in KiB/s, the item could be :const:`nvitop.NA` when not applicable. """ tx_throughputs = [] rx_throughputs = [] for tx, rx in self.nvlink_throughput(interval=interval): if libnvml.nvmlCheckReturn(tx, int): tx_throughputs.append(tx) if libnvml.nvmlCheckReturn(rx, int): rx_throughputs.append(rx) return ThroughputInfo( tx=sum(tx_throughputs) if tx_throughputs else NA, rx=sum(rx_throughputs) if rx_throughputs else NA, ) def nvlink_mean_throughput(self, interval: float | None = None) -> ThroughputInfo: # in KiB/s """The mean NVLink throughput for all NVLinks in KiB/s. This function is querying data counters between methods calls and thus is the NVLink throughput over that interval. For the first call, the function is blocking for 20ms to get the first data counters. Args: interval (Optional[float]): The interval in seconds between two calls to get the NVLink throughput. If ``interval`` is a positive number, compares throughput counters before and after the interval (blocking). If ``interval`` is :const`0.0` or :data:`None`, compares throughput counters since the last call, returning immediately (non-blocking). 
        Returns: ThroughputInfo(tx, rx)
            A named tuple with the mean NVLink throughput for all NVLinks in KiB/s, the item could
            be :const:`nvitop.NA` when not applicable.
        """
        tx_throughputs = []
        rx_throughputs = []
        for tx, rx in self.nvlink_throughput(interval=interval):
            if libnvml.nvmlCheckReturn(tx, int):
                tx_throughputs.append(tx)
            if libnvml.nvmlCheckReturn(rx, int):
                rx_throughputs.append(rx)
        return ThroughputInfo(
            tx=round(sum(tx_throughputs) / len(tx_throughputs)) if tx_throughputs else NA,
            rx=round(sum(rx_throughputs) / len(rx_throughputs)) if rx_throughputs else NA,
        )

    def nvlink_tx_throughput(self, interval: float | None = None) -> list[int | NaType]:  # in KiB/s
        """The current NVLink transmit data throughput in KiB/s for each NVLink.

        This function is querying data counters between method calls and thus is the NVLink
        throughput over that interval. For the first call, the function is blocking for 20ms to
        get the first data counters.

        Args:
            interval (Optional[float]):
                The interval in seconds between two calls to get the NVLink throughput. If
                ``interval`` is a positive number, compares throughput counters before and after
                the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
                throughput counters since the last call, returning immediately (non-blocking).

        Returns:
            List[Union[int, NaType]]
                The current NVLink transmit data throughput in KiB/s for each NVLink, or
                :const:`nvitop.NA` when not applicable.
        """
        return [tx for tx, _ in self.nvlink_throughput(interval=interval)]

    def nvlink_mean_tx_throughput(self, interval: float | None = None) -> int | NaType:  # in KiB/s
        """The mean NVLink transmit data throughput for all NVLinks in KiB/s.

        This function is querying data counters between method calls and thus is the NVLink
        throughput over that interval. For the first call, the function is blocking for 20ms to
        get the first data counters.

        Args:
            interval (Optional[float]):
                The interval in seconds between two calls to get the NVLink throughput. If
                ``interval`` is a positive number, compares throughput counters before and after
                the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
                throughput counters since the last call, returning immediately (non-blocking).

        Returns:
            Union[int, NaType]
                The mean NVLink transmit data throughput for all NVLinks in KiB/s, or
                :const:`nvitop.NA` when not applicable.
        """
        return self.nvlink_mean_throughput(interval=interval).tx

    def nvlink_total_tx_throughput(self, interval: float | None = None) -> int | NaType:  # in KiB/s
        """The total NVLink transmit data throughput for all NVLinks in KiB/s.

        This function is querying data counters between method calls and thus is the NVLink
        throughput over that interval. For the first call, the function is blocking for 20ms to
        get the first data counters.

        Args:
            interval (Optional[float]):
                The interval in seconds between two calls to get the NVLink throughput. If
                ``interval`` is a positive number, compares throughput counters before and after
                the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
                throughput counters since the last call, returning immediately (non-blocking).

        Returns:
            Union[int, NaType]
                The total NVLink transmit data throughput for all NVLinks in KiB/s, or
                :const:`nvitop.NA` when not applicable.
        """
        return self.nvlink_total_throughput(interval=interval).tx

    def nvlink_rx_throughput(self, interval: float | None = None) -> list[int | NaType]:  # in KiB/s
        """The current NVLink receive data throughput for each NVLink in KiB/s.

        This function is querying data counters between method calls and thus is the NVLink
        throughput over that interval. For the first call, the function is blocking for 20ms to
        get the first data counters.

        Args:
            interval (Optional[float]):
                The interval in seconds between two calls to get the NVLink throughput. If
                ``interval`` is a positive number, compares throughput counters before and after
                the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
                throughput counters since the last call, returning immediately (non-blocking).

        Returns:
            List[Union[int, NaType]]
                The current NVLink receive data throughput for each NVLink in KiB/s, or
                :const:`nvitop.NA` when not applicable.
        """
        return [rx for _, rx in self.nvlink_throughput(interval=interval)]

    def nvlink_mean_rx_throughput(self, interval: float | None = None) -> int | NaType:  # in KiB/s
        """The mean NVLink receive data throughput for all NVLinks in KiB/s.

        This function is querying data counters between method calls and thus is the NVLink
        throughput over that interval. For the first call, the function is blocking for 20ms to
        get the first data counters.

        Args:
            interval (Optional[float]):
                The interval in seconds between two calls to get the NVLink throughput. If
                ``interval`` is a positive number, compares throughput counters before and after
                the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
                throughput counters since the last call, returning immediately (non-blocking).

        Returns:
            Union[int, NaType]
                The mean NVLink receive data throughput for all NVLinks in KiB/s, or
                :const:`nvitop.NA` when not applicable.
        """
        return self.nvlink_mean_throughput(interval=interval).rx

    def nvlink_total_rx_throughput(self, interval: float | None = None) -> int | NaType:  # in KiB/s
        """The total NVLink receive data throughput for all NVLinks in KiB/s.

        This function is querying data counters between method calls and thus is the NVLink
        throughput over that interval. For the first call, the function is blocking for 20ms to
        get the first data counters.

        Args:
            interval (Optional[float]):
                The interval in seconds between two calls to get the NVLink throughput. If
                ``interval`` is a positive number, compares throughput counters before and after
                the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
                throughput counters since the last call, returning immediately (non-blocking).

        Returns:
            Union[int, NaType]
                The total NVLink receive data throughput for all NVLinks in KiB/s, or
                :const:`nvitop.NA` when not applicable.
        """
        return self.nvlink_total_throughput(interval=interval).rx

    def nvlink_tx_throughput_human(
        self,
        interval: float | None = None,
    ) -> list[str | NaType]:  # in human readable
        """The current NVLink transmit data throughput for each NVLink in human readable format.

        This function is querying data counters between method calls and thus is the NVLink
        throughput over that interval. For the first call, the function is blocking for 20ms to
        get the first data counters.

        Args:
            interval (Optional[float]):
                The interval in seconds between two calls to get the NVLink throughput. If
                ``interval`` is a positive number, compares throughput counters before and after
                the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
                throughput counters since the last call, returning immediately (non-blocking).

        Returns:
            List[Union[str, NaType]]
                The current NVLink transmit data throughput for each NVLink in human readable
                format, or :const:`nvitop.NA` when not applicable.
        """
        return [
            f'{bytes2human(tx * 1024)}/s' if libnvml.nvmlCheckReturn(tx, int) else NA
            for tx in self.nvlink_tx_throughput(interval=interval)
        ]

    def nvlink_mean_tx_throughput_human(
        self,
        interval: float | None = None,
    ) -> str | NaType:  # in human readable
        """The mean NVLink transmit data throughput for all NVLinks in human readable format.

        This function is querying data counters between method calls and thus is the NVLink
        throughput over that interval. For the first call, the function is blocking for 20ms to
        get the first data counters.

        Args:
            interval (Optional[float]):
                The interval in seconds between two calls to get the NVLink throughput. If
                ``interval`` is a positive number, compares throughput counters before and after
                the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
                throughput counters since the last call, returning immediately (non-blocking).

        Returns:
            Union[str, NaType]
                The mean NVLink transmit data throughput for all NVLinks in human readable format,
                or :const:`nvitop.NA` when not applicable.
        """
        mean_tx = self.nvlink_mean_tx_throughput(interval=interval)
        if libnvml.nvmlCheckReturn(mean_tx, int):
            return f'{bytes2human(mean_tx * 1024)}/s'
        return NA

    def nvlink_total_tx_throughput_human(
        self,
        interval: float | None = None,
    ) -> str | NaType:  # in human readable
        """The total NVLink transmit data throughput for all NVLinks in human readable format.

        This function is querying data counters between method calls and thus is the NVLink
        throughput over that interval. For the first call, the function is blocking for 20ms to
        get the first data counters.

        Args:
            interval (Optional[float]):
                The interval in seconds between two calls to get the NVLink throughput. If
                ``interval`` is a positive number, compares throughput counters before and after
                the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
                throughput counters since the last call, returning immediately (non-blocking).

        Returns:
            Union[str, NaType]
                The total NVLink transmit data throughput for all NVLinks in human readable
                format, or :const:`nvitop.NA` when not applicable.
        """
        total_tx = self.nvlink_total_tx_throughput(interval=interval)
        if libnvml.nvmlCheckReturn(total_tx, int):
            return f'{bytes2human(total_tx * 1024)}/s'
        return NA

    def nvlink_rx_throughput_human(
        self,
        interval: float | None = None,
    ) -> list[str | NaType]:  # in human readable
        """The current NVLink receive data throughput for each NVLink in human readable format.

        This function is querying data counters between method calls and thus is the NVLink
        throughput over that interval. For the first call, the function is blocking for 20ms to
        get the first data counters.
Args: interval (Optional[float]): The interval in seconds between two calls to get the NVLink throughput. If ``interval`` is a positive number, compares throughput counters before and after the interval (blocking). If ``interval`` is :const`0.0` or :data:`None`, compares throughput counters since the last call, returning immediately (non-blocking). Returns: Union[str, NaType] The current NVLink receive data throughput for each NVLink in human readable format, or :const:`nvitop.NA` when not applicable. """ return [ f'{bytes2human(rx * 1024)}/s' if libnvml.nvmlCheckReturn(rx, int) else NA for rx in self.nvlink_rx_throughput(interval=interval) ] def nvlink_mean_rx_throughput_human( self, interval: float | None = None, ) -> str | NaType: # in human readable """The mean NVLink receive data throughput for all NVLinks in human readable format. This function is querying data counters between methods calls and thus is the NVLink throughput over that interval. For the first call, the function is blocking for 20ms to get the first data counters. Args: interval (Optional[float]): The interval in seconds between two calls to get the NVLink throughput. If ``interval`` is a positive number, compares throughput counters before and after the interval (blocking). If ``interval`` is :const`0.0` or :data:`None`, compares throughput counters since the last call, returning immediately (non-blocking). Returns: Union[str, NaType] The mean NVLink receive data throughput for all NVLinks in human readable format, or :const:`nvitop.NA` when not applicable. """ mean_rx = self.nvlink_mean_rx_throughput(interval=interval) if libnvml.nvmlCheckReturn(mean_rx, int): return f'{bytes2human(mean_rx * 1024)}/s' return NA def nvlink_total_rx_throughput_human( self, interval: float | None = None, ) -> str | NaType: # in human readable """The total NVLink receive data throughput for all NVLinks in human readable format. 
This function is querying data counters between methods calls and thus is the NVLink throughput over that interval. For the first call, the function is blocking for 20ms to get the first data counters. Args: interval (Optional[float]): The interval in seconds between two calls to get the NVLink throughput. If ``interval`` is a positive number, compares throughput counters before and after the interval (blocking). If ``interval`` is :const`0.0` or :data:`None`, compares throughput counters since the last call, returning immediately (non-blocking). Returns: Union[str, NaType] The total NVLink receive data throughput for all NVLinks in human readable format, or :const:`nvitop.NA` when not applicable. """ total_rx = self.nvlink_total_rx_throughput(interval=interval) if libnvml.nvmlCheckReturn(total_rx, int): return f'{bytes2human(total_rx * 1024)}/s' return NA def display_active(self) -> str | NaType: """A flag that indicates whether a display is initialized on the GPU's (e.g. memory is allocated on the device for display). Display can be active even when no monitor is physically attached. "Enabled" indicates an active display. "Disabled" indicates otherwise. Returns: Union[str, NaType] - :const:`'Disabled'`: if not an active display device. - :const:`'Enabled'`: if an active display device. - :const:`nvitop.NA`: if not applicable. Command line equivalent: .. code:: bash nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=display_active """ # pylint: disable=line-too-long if self._handle is not None: return { 0: 'Disabled', 1: 'Enabled', }.get(libnvml.nvmlQuery('nvmlDeviceGetDisplayActive', self._handle), NA) return NA def display_mode(self) -> str | NaType: """A flag that indicates whether a physical display (e.g. monitor) is currently connected to any of the GPU's connectors. "Enabled" indicates an attached display. "Disabled" indicates otherwise. Returns: Union[str, NaType] - :const:`'Disabled'`: if the display mode is disabled. 
            - :const:`'Enabled'`: if the display mode is enabled.
            - :const:`nvitop.NA`: if not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=display_mode
        """  # pylint: disable=line-too-long
        if self._handle is not None:
            return {
                0: 'Disabled',
                1: 'Enabled',
            }.get(libnvml.nvmlQuery('nvmlDeviceGetDisplayMode', self._handle), NA)
        return NA

    def current_driver_model(self) -> str | NaType:
        """The driver model currently in use.

        Always "N/A" on Linux. On Windows, the TCC (WDM) and WDDM driver models are supported. The
        TCC driver model is optimized for compute applications. I.E. kernel launch times will be
        quicker with TCC. The WDDM driver model is designed for graphics applications and is not
        recommended for compute applications. Linux does not support multiple driver models, and
        will always have the value of "N/A".

        Returns: Union[str, NaType]
            - :const:`'WDDM'`: for WDDM driver model on Windows.
            - :const:`'WDM'`: for TTC (WDM) driver model on Windows.
            - :const:`nvitop.NA`: if not applicable, e.g. on Linux.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=driver_model.current
        """  # pylint: disable=line-too-long
        if self._handle is not None:
            return {
                libnvml.NVML_DRIVER_WDDM: 'WDDM',
                libnvml.NVML_DRIVER_WDM: 'WDM',
            }.get(libnvml.nvmlQuery('nvmlDeviceGetCurrentDriverModel', self._handle), NA)
        return NA

    # Alias kept for API parity with the `driver_model.current` query name.
    driver_model = current_driver_model

    def persistence_mode(self) -> str | NaType:
        """A flag that indicates whether persistence mode is enabled for the GPU.

        Value is either "Enabled" or "Disabled". When persistence mode is enabled the NVIDIA driver
        remains loaded even when no active clients, such as X11 or nvidia-smi, exist. This minimizes
        the driver load latency associated with running dependent apps, such as CUDA programs. Linux
        only.

        Returns: Union[str, NaType]
            - :const:`'Disabled'`: if the persistence mode is disabled.
            - :const:`'Enabled'`: if the persistence mode is enabled.
            - :const:`nvitop.NA`: if not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=persistence_mode
        """  # pylint: disable=line-too-long
        if self._handle is not None:
            return {
                0: 'Disabled',
                1: 'Enabled',
            }.get(libnvml.nvmlQuery('nvmlDeviceGetPersistenceMode', self._handle), NA)
        return NA

    def performance_state(self) -> str | NaType:
        """The current performance state for the GPU.

        States range from P0 (maximum performance) to P12 (minimum performance).

        Returns: Union[str, NaType]
            The current performance state in format ``P<N>``, or :const:`nvitop.NA` when not
            applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=pstate
        """  # pylint: disable=line-too-long
        if self._handle is not None:
            performance_state = libnvml.nvmlQuery('nvmlDeviceGetPerformanceState', self._handle)
            if libnvml.nvmlCheckReturn(performance_state, int):
                # NVML returns a bare integer; format it as the familiar `P<N>` string.
                performance_state = 'P' + str(performance_state)
            return performance_state
        return NA

    def total_volatile_uncorrected_ecc_errors(self) -> int | NaType:
        """Total errors detected across entire chip.

        Returns: Union[int, NaType]
            The total number of uncorrected errors in volatile ECC memory, or :const:`nvitop.NA`
            when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=ecc.errors.uncorrected.volatile.total
        """  # pylint: disable=line-too-long
        if self._handle is not None:
            return libnvml.nvmlQuery(
                'nvmlDeviceGetTotalEccErrors',
                self._handle,
                libnvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
                libnvml.NVML_VOLATILE_ECC,
            )
        return NA

    def compute_mode(self) -> str | NaType:
        """The compute mode flag indicates whether individual or multiple compute applications may run on the GPU.

        Returns: Union[str, NaType]
            - :const:`'Default'`: means multiple contexts are allowed per device.
            - :const:`'Exclusive Thread'`: deprecated, use Exclusive Process instead
            - :const:`'Prohibited'`: means no contexts are allowed per device (no compute apps).
            - :const:`'Exclusive Process'`: means only one context is allowed per device, usable
              from multiple threads at a time.
            - :const:`nvitop.NA`: if not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=compute_mode
        """  # pylint: disable=line-too-long
        if self._handle is not None:
            return {
                libnvml.NVML_COMPUTEMODE_DEFAULT: 'Default',
                libnvml.NVML_COMPUTEMODE_EXCLUSIVE_THREAD: 'Exclusive Thread',
                libnvml.NVML_COMPUTEMODE_PROHIBITED: 'Prohibited',
                libnvml.NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: 'Exclusive Process',
            }.get(libnvml.nvmlQuery('nvmlDeviceGetComputeMode', self._handle), NA)
        return NA

    def cuda_compute_capability(self) -> tuple[int, int] | NaType:
        """The CUDA compute capability for the device.

        Returns: Union[Tuple[int, int], NaType]
            The CUDA compute capability version in format ``(major, minor)``, or
            :const:`nvitop.NA` when not applicable.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=compute_cap
        """
        if self._handle is not None:
            # The compute capability never changes for a device; cache it after the first query.
            if self._cuda_compute_capability is None:
                self._cuda_compute_capability = libnvml.nvmlQuery(
                    'nvmlDeviceGetCudaComputeCapability',
                    self._handle,
                )
            return self._cuda_compute_capability
        return NA

    def is_mig_device(self) -> bool:
        """Return whether or not the device is a MIG device."""
        if self._handle is not None:
            # Cached after the first query; MIG-device-ness never changes for a handle.
            if self._is_mig_device is None:
                is_mig_device = libnvml.nvmlQuery(
                    'nvmlDeviceIsMigDeviceHandle',
                    self._handle,
                    default=False,
                    ignore_function_not_found=True,
                )
                # nvmlDeviceIsMigDeviceHandle returns c_uint
                self._is_mig_device = bool(
                    is_mig_device,
                )
            return self._is_mig_device
        return False

    def mig_mode(self) -> str | NaType:
        """The MIG mode that the GPU is currently operating under.

        Returns: Union[str, NaType]
            - :const:`'Disabled'`: if the MIG mode is disabled.
            - :const:`'Enabled'`: if the MIG mode is enabled.
            - :const:`nvitop.NA`: if not applicable, e.g. the GPU does not support MIG mode.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=mig.mode.current
        """
        if self._handle is None:
            return NA
        # MIG mode is a property of the physical GPU, not of a MIG child device.
        if self.is_mig_device():
            return NA
        mig_mode, *_ = libnvml.nvmlQuery(
            'nvmlDeviceGetMigMode',
            self._handle,
            default=(NA, NA),
            ignore_function_not_found=True,
        )
        return {0: 'Disabled', 1: 'Enabled'}.get(mig_mode, NA)

    def is_mig_mode_enabled(self) -> bool:
        """Test whether the MIG mode is enabled on the device.

        Return :data:`False` if MIG mode is disabled or the device does not support MIG mode.
        """
        return boolify(self.mig_mode())

    def max_mig_device_count(self) -> int:
        """Return the maximum number of MIG instances the device supports.

        This method will return 0 if the device does not support MIG mode.
        """
        return 0  # implemented in PhysicalDevice

    def mig_devices(self) -> list[MigDevice]:
        """Return a list of children MIG devices of the current device.

        This method will return an empty list if the MIG mode is disabled or the device does not
        support MIG mode.
        """
        return []  # implemented in PhysicalDevice

    def is_leaf_device(self) -> bool:
        """Test whether the device is a physical device with MIG mode disabled or a MIG device.

        Return :data:`True` if the device is a physical device with MIG mode disabled or a MIG
        device. Otherwise, return :data:`False` if the device is a physical device with MIG mode
        enabled.
        """
        return self.is_mig_device() or not self.is_mig_mode_enabled()

    def to_leaf_devices(
        self,
    ) -> list[PhysicalDevice] | list[MigDevice] | list[CudaDevice] | list[CudaMigDevice]:
        """Return a list of leaf devices.

        Note that a CUDA device is always a leaf device.
        """
        if isinstance(self, CudaDevice) or self.is_leaf_device():
            return [self]  # type: ignore[return-value]
        return self.mig_devices()

    def processes(self) -> dict[int, GpuProcess]:
        """Return a dictionary of processes running on the GPU.
        Returns: Dict[int, GpuProcess]
            A dictionary mapping PID to GPU process instance.
        """
        if self._handle is None:
            return {}

        processes = {}

        # Track whether any process reported `N/A` memory usage — in that case the
        # zero-utilization backfill below would be unreliable and is skipped.
        found_na = False
        for type, func in (  # pylint: disable=redefined-builtin
            ('C', 'nvmlDeviceGetComputeRunningProcesses'),
            ('G', 'nvmlDeviceGetGraphicsRunningProcesses'),
        ):
            for p in libnvml.nvmlQuery(func, self._handle, default=()):
                if isinstance(p.usedGpuMemory, int):
                    gpu_memory = p.usedGpuMemory
                else:
                    # Used GPU memory is `N/A` on Windows Display Driver Model (WDDM)
                    # or on MIG-enabled GPUs
                    gpu_memory = NA  # type: ignore[assignment]
                    found_na = True
                proc = processes[p.pid] = self.GPU_PROCESS_CLASS(
                    pid=p.pid,
                    device=self,
                    gpu_memory=gpu_memory,
                    gpu_instance_id=getattr(p, 'gpuInstanceId', UINT_MAX),
                    compute_instance_id=getattr(p, 'computeInstanceId', UINT_MAX),
                )
                # A process can be both a compute ('C') and a graphics ('G') client.
                proc.type = proc.type + type

        if len(processes) > 0:
            samples = libnvml.nvmlQuery(
                'nvmlDeviceGetProcessUtilization',
                self._handle,
                # Only utilization samples that were recorded after this timestamp will be returned.
                # The CPU timestamp, i.e. absolute Unix epoch timestamp (in microseconds), is used.
                # Here we use the timestamp 1 second ago to ensure the record buffer is not empty.
                time.time_ns() // 1000 - 1000_000,
                default=(),
            )
            # Apply samples oldest-first so the newest sample wins for each PID.
            for s in sorted(samples, key=lambda s: s.timeStamp):
                try:
                    processes[s.pid].set_gpu_utilization(s.smUtil, s.memUtil, s.encUtil, s.decUtil)
                except KeyError:  # noqa: PERF203
                    # Sample for a process that has already exited (or is not listed above).
                    pass
            if not found_na:
                # Processes without any utilization sample are assumed idle.
                for pid in set(processes).difference(s.pid for s in samples):
                    processes[pid].set_gpu_utilization(0, 0, 0, 0)

        return processes

    def as_snapshot(self) -> Snapshot:
        """Return a onetime snapshot of the device.

        The attributes are defined in :attr:`SNAPSHOT_KEYS`.
        """
        # Use oneshot() so the shared NVML queries behind the snapshot keys run only once.
        with self.oneshot():
            return Snapshot(
                real=self,
                index=self.index,
                physical_index=self.physical_index,
                **{key: getattr(self, key)() for key in self.SNAPSHOT_KEYS},
            )

    # Method names (all zero-argument) collected by `as_snapshot()`.
    SNAPSHOT_KEYS: ClassVar[list[str]] = [
        'name',
        'uuid',
        'bus_id',
        'memory_info',
        'memory_used',
        'memory_free',
        'memory_total',
        'memory_used_human',
        'memory_free_human',
        'memory_total_human',
        'memory_percent',
        'memory_usage',
        'utilization_rates',
        'gpu_utilization',
        'memory_utilization',
        'encoder_utilization',
        'decoder_utilization',
        'clock_infos',
        'max_clock_infos',
        'clock_speed_infos',
        'sm_clock',
        'memory_clock',
        'fan_speed',
        'temperature',
        'power_usage',
        'power_limit',
        'power_status',
        'pcie_throughput',
        'pcie_tx_throughput',
        'pcie_rx_throughput',
        'pcie_tx_throughput_human',
        'pcie_rx_throughput_human',
        'display_active',
        'display_mode',
        'current_driver_model',
        'persistence_mode',
        'performance_state',
        'total_volatile_uncorrected_ecc_errors',
        'compute_mode',
        'cuda_compute_capability',
        'mig_mode',
    ]

    # Modified from psutil (https://github.com/giampaolo/psutil)
    @contextlib.contextmanager
    def oneshot(self) -> Generator[None]:
        """A utility context manager which considerably speeds up the retrieval of multiple device information at the same time.

        Internally different device info (e.g. memory_info, utilization_rates, ...) may be fetched
        by using the same routine, but only one information is returned and the others are
        discarded. When using this context manager the internal routine is executed once (in the
        example below on memory_info()) and the other info are cached.

        The cache is cleared when exiting the context manager block.

        The advice is to use this every time you retrieve more than one information about the
        device.

        Examples:
            >>> from nvitop import Device
            >>> device = Device(0)
            >>> with device.oneshot():
            ...     device.memory_info()        # collect multiple info
            ...     device.memory_used()        # return cached value
            ...     device.memory_free_human()  # return cached value
            ...
                device.memory_percent()     # return cached value
        """  # pylint: disable=line-too-long
        with self._lock:  # pylint: disable=no-member
            if hasattr(self, '_cache'):
                # NOOP: this covers the use case where the user enters the
                # context twice:
                #
                # >>> with device.oneshot():
                # ...     with device.oneshot():
                # ...
                #
                # Also, since as_snapshot() internally uses oneshot()
                # I expect that the code below will be a pretty common
                # "mistake" that the user will make, so let's guard
                # against that:
                #
                # >>> with device.oneshot():
                # ...     device.as_snapshot()
                # ...
                yield
            else:
                try:
                    # Activate the per-instance cache for the shared NVML query routines.
                    self.memory_info.cache_activate(self)  # type: ignore[attr-defined]
                    self.bar1_memory_info.cache_activate(self)  # type: ignore[attr-defined]
                    self.utilization_rates.cache_activate(self)  # type: ignore[attr-defined]
                    self.clock_infos.cache_activate(self)  # type: ignore[attr-defined]
                    self.max_clock_infos.cache_activate(self)  # type: ignore[attr-defined]
                    self.power_usage.cache_activate(self)  # type: ignore[attr-defined]
                    self.power_limit.cache_activate(self)  # type: ignore[attr-defined]
                    yield
                finally:
                    # Always clear the cache on exit, even if the body raised.
                    self.memory_info.cache_deactivate(self)  # type: ignore[attr-defined]
                    self.bar1_memory_info.cache_deactivate(self)  # type: ignore[attr-defined]
                    self.utilization_rates.cache_deactivate(self)  # type: ignore[attr-defined]
                    self.clock_infos.cache_deactivate(self)  # type: ignore[attr-defined]
                    self.max_clock_infos.cache_deactivate(self)  # type: ignore[attr-defined]
                    self.power_usage.cache_deactivate(self)  # type: ignore[attr-defined]
                    self.power_limit.cache_deactivate(self)  # type: ignore[attr-defined]


class PhysicalDevice(Device):
    """Class for physical devices.

    This is the real GPU installed in the system.
    """

    _nvml_index: int
    index: int
    nvml_index: int

    @property
    def physical_index(self) -> int:
        """Zero based index of the GPU. Can change at each boot.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=index
        """
        return self._nvml_index

    def max_mig_device_count(self) -> int:
        """Return the maximum number of MIG instances the device supports.

        This method will return 0 if the device does not support MIG mode.
        """
        if self._handle is not None:
            return libnvml.nvmlQuery(
                'nvmlDeviceGetMaxMigDeviceCount',
                self._handle,
                default=0,
                ignore_function_not_found=True,
            )
        return 0

    def mig_device(self, mig_index: int) -> MigDevice:
        """Return a child MIG device of the given index.

        Raises:
            libnvml.NVMLError:
                If the device does not support MIG mode or the given MIG device does not exist.
        """
        # Publish `self` as the global parent so MigDevice.__init__ can reuse this handle.
        with _global_physical_device(self):
            return MigDevice(index=(self.index, mig_index))

    def mig_devices(self) -> list[MigDevice]:
        """Return a list of children MIG devices of the current device.

        This method will return an empty list if the MIG mode is disabled or the device does not
        support MIG mode.
        """
        mig_devices = []
        if self.is_mig_mode_enabled():
            max_mig_device_count = self.max_mig_device_count()
            with _global_physical_device(self):
                for mig_index in range(max_mig_device_count):
                    try:
                        mig_device = MigDevice(index=(self.index, mig_index))
                    except libnvml.NVMLError:  # noqa: PERF203
                        # MIG indices are allocated contiguously; the first failure ends the scan.
                        break
                    else:
                        mig_devices.append(mig_device)
        return mig_devices


class MigDevice(Device):  # pylint: disable=too-many-instance-attributes
    """Class for MIG devices."""

    _nvml_index: tuple[int, int]
    nvml_index: tuple[int, int]

    @classmethod
    def count(cls) -> int:
        """The number of total MIG devices aggregated over all physical devices."""
        return len(cls.all())

    @classmethod
    def all(cls) -> list[MigDevice]:  # type: ignore[override]
        """Return a list of MIG devices aggregated over all physical devices."""
        mig_devices = []
        for device in PhysicalDevice.all():
            mig_devices.extend(device.mig_devices())
        return mig_devices

    @classmethod
    def from_indices(  # type: ignore[override]  # pylint: disable=signature-differs
        cls,
        indices: Iterable[tuple[int, int]],
    ) -> list[MigDevice]:
"""Return a list of MIG devices of the given indices. Args: indices (Iterable[Tuple[int, int]]): Indices of the MIG devices. Each index is a tuple of two integers. Returns: List[MigDevice] A list of :class:`MigDevice` instances of the given indices. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. """ return list(map(cls, indices)) # pylint: disable-next=super-init-not-called def __init__( self, index: tuple[int, int] | str | None = None, *, uuid: str | None = None, ) -> None: """Initialize the instance created by :meth:`__new__()`. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. 
""" if isinstance(index, str) and self.UUID_PATTERN.match(index) is not None: # passed by UUID index, uuid = None, index index, uuid = (arg.encode() if isinstance(arg, str) else arg for arg in (index, uuid)) self._name: str = NA self._uuid: str = NA self._bus_id: str = NA self._memory_total: int | NaType = NA self._memory_total_human: str = NA self._gpu_instance_id: int | NaType = NA self._compute_instance_id: int | NaType = NA self._nvlink_link_count: int | None = None self._nvlink_throughput_counters: tuple[tuple[int | NaType, int]] | None = None self._is_mig_device: bool = True self._cuda_index: int | None = None self._cuda_compute_capability: tuple[int, int] | NaType | None = None if index is not None: self._nvml_index = index # type: ignore[assignment] self._handle = None parent = _get_global_physical_device() if ( parent is None or parent.handle is None or parent.physical_index != self.physical_index ): parent = PhysicalDevice(index=self.physical_index) self._parent = parent if self.parent.handle is not None: try: self._handle = libnvml.nvmlQuery( 'nvmlDeviceGetMigDeviceHandleByIndex', self.parent.handle, self.mig_index, ignore_errors=False, ) except libnvml.NVMLError_GpuIsLost: pass else: self._handle = libnvml.nvmlQuery('nvmlDeviceGetHandleByUUID', uuid, ignore_errors=False) parent_handle = libnvml.nvmlQuery( 'nvmlDeviceGetDeviceHandleFromMigDeviceHandle', self._handle, ignore_errors=False, ) parent_index = libnvml.nvmlQuery( 'nvmlDeviceGetIndex', parent_handle, ignore_errors=False, ) self._parent = PhysicalDevice(index=parent_index) for mig_device in self.parent.mig_devices(): if self.uuid() == mig_device.uuid(): self._nvml_index = mig_device.index break else: raise libnvml.NVMLError_NotFound self._max_clock_infos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA) self._lock = threading.RLock() self._ident = (self.index, self.uuid()) self._hash = None @property def index(self) -> tuple[int, int]: """The index of the MIG device. 
        This is a tuple of two integers.
        """
        return self._nvml_index

    @property
    def physical_index(self) -> int:
        """The index of the parent physical device."""
        return self._nvml_index[0]

    @property
    def mig_index(self) -> int:
        """The index of the MIG device over all the MIG devices of the parent device."""
        return self._nvml_index[1]

    @property
    def parent(self) -> PhysicalDevice:
        """The parent physical device."""
        return self._parent

    def gpu_instance_id(self) -> int | NaType:
        """The gpu instance ID of the MIG device.

        Returns: Union[int, NaType]
            The gpu instance ID of the MIG device, or :const:`nvitop.NA` when not applicable.
        """
        # Cached after the first successful query; UINT_MAX is NVML's "not available" sentinel.
        if self._handle is not None and self._gpu_instance_id is NA:
            self._gpu_instance_id = libnvml.nvmlQuery(
                'nvmlDeviceGetGpuInstanceId',
                self._handle,
                default=UINT_MAX,
            )
            if self._gpu_instance_id == UINT_MAX:
                self._gpu_instance_id = NA
        return self._gpu_instance_id

    def compute_instance_id(self) -> int | NaType:
        """The compute instance ID of the MIG device.

        Returns: Union[int, NaType]
            The compute instance ID of the MIG device, or :const:`nvitop.NA` when not applicable.
        """
        # Cached after the first successful query; UINT_MAX is NVML's "not available" sentinel.
        if self._handle is not None and self._compute_instance_id is NA:
            self._compute_instance_id = libnvml.nvmlQuery(
                'nvmlDeviceGetComputeInstanceId',
                self._handle,
                default=UINT_MAX,
            )
            if self._compute_instance_id == UINT_MAX:
                self._compute_instance_id = NA
        return self._compute_instance_id

    def as_snapshot(self) -> Snapshot:
        """Return a onetime snapshot of the device.

        The attributes are defined in :attr:`SNAPSHOT_KEYS`.
        """
        snapshot = super().as_snapshot()
        snapshot.mig_index = self.mig_index  # type: ignore[attr-defined]
        return snapshot

    SNAPSHOT_KEYS: ClassVar[list[str]] = [
        *Device.SNAPSHOT_KEYS,
        'gpu_instance_id',
        'compute_instance_id',
    ]


class CudaDevice(Device):
    """Class for devices enumerated over the CUDA ordinal. The order can vary for different ``CUDA_VISIBLE_DEVICES`` environment variable.

    See also for CUDA Device Enumeration:
        - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
        - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

    :meth:`CudaDevice.__new__()` returns different types depending on the given arguments.

    .. code-block:: python

        - (cuda_index: int)        -> Union[CudaDevice, CudaMigDevice]  # depending on `CUDA_VISIBLE_DEVICES`
        - (uuid: str)              -> Union[CudaDevice, CudaMigDevice]  # depending on `CUDA_VISIBLE_DEVICES`
        - (nvml_index: int)        -> CudaDevice
        - (nvml_index: (int, int)) -> CudaMigDevice

    Examples:
        >>> import os
        >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        >>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0'

        >>> CudaDevice.count()                     # number of NVIDIA GPUs visible to CUDA applications
        4
        >>> Device.cuda.count()                    # use alias in class `Device`
        4

        >>> CudaDevice.all()                       # all CUDA visible devices (or `Device.cuda.all()`)
        [
            CudaDevice(cuda_index=0, nvml_index=3, ...),
            CudaDevice(cuda_index=1, nvml_index=2, ...),
            ...
        ]

        >>> cuda0 = CudaDevice(cuda_index=0)       # use CUDA ordinal (or `Device.cuda(0)`)
        >>> cuda1 = CudaDevice(nvml_index=2)       # use NVML ordinal
        >>> cuda2 = CudaDevice(uuid='GPU-xxxxxx')  # use UUID string

        >>> cuda0.memory_free()                    # total free memory in bytes
        11550654464
        >>> cuda0.memory_free_human()              # total free memory in human readable format
        '11016MiB'

        >>> cuda1.as_snapshot()                    # takes an onetime snapshot of the device
        CudaDeviceSnapshot(
            real=CudaDevice(cuda_index=1, nvml_index=2, ...),
            ...
        )

    Raises:
        libnvml.NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        libnvml.NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        libnvml.NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
        libnvml.NVMLError_NotFound:
            If the device is not found for the given NVML identifier.
        libnvml.NVMLError_InvalidArgument:
            If the NVML index is out of range.
        TypeError:
            If the number of non-None arguments is not exactly 1.
        TypeError:
            If the given NVML index is a tuple but is not consist of two integers.
        RuntimeError:
            If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment
            variable.
    """  # pylint: disable=line-too-long

    _nvml_index: int
    index: int
    nvml_index: int

    @classmethod
    def is_available(cls) -> bool:
        """Test whether there are any CUDA-capable devices available."""
        return cls.count() > 0

    @classmethod
    def count(cls) -> int:
        """The number of GPUs visible to CUDA applications."""
        try:
            return len(super().parse_cuda_visible_devices())
        except libnvml.NVMLError:
            return 0

    @classmethod
    def all(cls) -> list[CudaDevice]:  # type: ignore[override]
        """All CUDA visible devices.

        Note:
            The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is
            invalid.
        """
        return cls.from_indices()

    @classmethod
    def from_indices(  # type: ignore[override]
        cls,
        indices: int | Iterable[int] | None = None,
    ) -> list[CudaDevice]:
        """Return a list of CUDA devices of the given CUDA indices.

        The CUDA ordinal will be enumerate from the ``CUDA_VISIBLE_DEVICES`` environment variable.

        See also for CUDA Device Enumeration:
            - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
            - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

        Args:
            indices (Iterable[int]):
                The indices of the GPU in CUDA ordinal, if not given, returns all visible CUDA
                devices.

        Returns: List[CudaDevice]
            A list of :class:`CudaDevice` of the given CUDA indices.

        Raises:
            libnvml.NVMLError_LibraryNotFound:
                If cannot find the NVML library, usually the NVIDIA driver is not installed.
            libnvml.NVMLError_DriverNotLoaded:
                If NVIDIA driver is not loaded.
            libnvml.NVMLError_LibRmVersionMismatch:
                If RM detects a driver/library version mismatch, usually after an upgrade for
                NVIDIA driver without reloading the kernel module.
            RuntimeError:
                If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment
                variable.
""" return super().from_cuda_indices(indices) def __new__( cls, cuda_index: int | None = None, *, nvml_index: int | tuple[int, int] | None = None, uuid: str | None = None, ) -> Self: """Create a new instance of CudaDevice. The type of the result is determined by the given argument. .. code-block:: python - (cuda_index: int) -> Union[CudaDevice, CudaMigDevice] # depending on `CUDA_VISIBLE_DEVICES` - (uuid: str) -> Union[CudaDevice, CudaMigDevice] # depending on `CUDA_VISIBLE_DEVICES` - (nvml_index: int) -> CudaDevice - (nvml_index: (int, int)) -> CudaMigDevice Note: This method takes exact 1 non-None argument. Returns: Union[CudaDevice, CudaMigDevice] A :class:`CudaDevice` instance or a :class:`CudaMigDevice` instance. Raises: TypeError: If the number of non-None arguments is not exactly 1. TypeError: If the given NVML index is a tuple but is not consist of two integers. RuntimeError: If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable. """ if nvml_index is not None and uuid is not None: raise TypeError( f'CudaDevice(cuda_index=None, nvml_index=None, uuid=None) takes 1 non-None arguments ' f'but (cuda_index, nvml_index, uuid) = {(cuda_index, nvml_index, uuid)!r} were given', ) if cuda_index is not None and nvml_index is None and uuid is None: cuda_visible_devices = cls.parse_cuda_visible_devices() if not isinstance(cuda_index, int) or not 0 <= cuda_index < len(cuda_visible_devices): raise RuntimeError(f'CUDA Error: invalid device ordinal: {cuda_index!r}.') nvml_index = cuda_visible_devices[cuda_index] if cls is not CudaDevice: # Use the subclass type if the type is explicitly specified return super().__new__(cls, index=nvml_index, uuid=uuid) # Auto subclass type inference logic goes here when `cls` is `CudaDevice` (e.g., calls `CudaDevice(...)`) if (nvml_index is not None and not isinstance(nvml_index, int)) or is_mig_device_uuid(uuid): return super().__new__(CudaMigDevice, index=nvml_index, uuid=uuid) # type: 
ignore[return-value] return super().__new__(CudaDevice, index=nvml_index, uuid=uuid) # type: ignore[return-value] def __init__( self, cuda_index: int | None = None, *, nvml_index: int | tuple[int, int] | None = None, uuid: str | None = None, ) -> None: """Initialize the instance created by :meth:`__new__()`. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. libnvml.NVMLError_InvalidArgument: If the NVML index is out of range. RuntimeError: If the given device is not visible to CUDA applications (i.e. not listed in the ``CUDA_VISIBLE_DEVICES`` environment variable or the environment variable is invalid). 
""" if cuda_index is not None and nvml_index is None and uuid is None: cuda_visible_devices = self.parse_cuda_visible_devices() if not isinstance(cuda_index, int) or not 0 <= cuda_index < len(cuda_visible_devices): raise RuntimeError(f'CUDA Error: invalid device ordinal: {cuda_index!r}.') nvml_index = cuda_visible_devices[cuda_index] super().__init__(index=nvml_index, uuid=uuid) # type: ignore[arg-type] if cuda_index is None: cuda_index = super().cuda_index self._cuda_index: int = cuda_index self._ident: tuple[Hashable, str] = ((self._cuda_index, self.index), self.uuid()) def __repr__(self) -> str: """Return a string representation of the CUDA device.""" return '{}(cuda_index={}, nvml_index={}, name="{}", total_memory={})'.format( # noqa: UP032 self.__class__.__name__, self.cuda_index, self.index, self.name(), self.memory_total_human(), ) def __reduce__(self) -> tuple[type[CudaDevice], tuple[int]]: """Return state information for pickling.""" return self.__class__, (self._cuda_index,) def as_snapshot(self) -> Snapshot: """Return a onetime snapshot of the device. The attributes are defined in :attr:`SNAPSHOT_KEYS`. 
""" snapshot = super().as_snapshot() snapshot.cuda_index = self.cuda_index # type: ignore[attr-defined] return snapshot Device.cuda = CudaDevice """Shortcut for class :class:`CudaDevice`.""" class CudaMigDevice(CudaDevice, MigDevice): # type: ignore[misc] """Class for CUDA devices that are MIG devices.""" _nvml_index: tuple[int, int] # type: ignore[assignment] index: tuple[int, int] # type: ignore[assignment] nvml_index: tuple[int, int] # type: ignore[assignment] def is_mig_device_uuid(uuid: str | None) -> bool: """Return :data:`True` if the argument is a MIG device UUID, otherwise, return :data:`False`.""" if isinstance(uuid, str): match = Device.UUID_PATTERN.match(uuid) if match is not None and match.group('MigMode') is not None: return True return False def parse_cuda_visible_devices( cuda_visible_devices: str | None = _VALUE_OMITTED, ) -> list[int] | list[tuple[int, int]]: """Parse the given ``CUDA_VISIBLE_DEVICES`` value into a list of NVML device indices. This function is aliased by :meth:`Device.parse_cuda_visible_devices`. Note: The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. See also for CUDA Device Enumeration: - `CUDA Environment Variables `_ - `CUDA Device Enumeration for MIG Device `_ Args: cuda_visible_devices (Optional[str]): The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES`` environment variable will be unset before parsing. Returns: Union[List[int], List[Tuple[int, int]]] A list of int (physical device) or a list of tuple of two integers (MIG device) for the corresponding real device indices. 
Examples: >>> import os >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' >>> os.environ['CUDA_VISIBLE_DEVICES'] = '6,5' >>> parse_cuda_visible_devices() # parse the `CUDA_VISIBLE_DEVICES` environment variable to NVML indices [6, 5] >>> parse_cuda_visible_devices('0,4') # pass the `CUDA_VISIBLE_DEVICES` value explicitly [0, 4] >>> parse_cuda_visible_devices('GPU-18ef14e9,GPU-849d5a8d') # accept abbreviated UUIDs [5, 6] >>> parse_cuda_visible_devices(None) # get all devices when the `CUDA_VISIBLE_DEVICES` environment variable unset [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] >>> parse_cuda_visible_devices('MIG-d184f67c-c95f-5ef2-a935-195bd0094fbd') # MIG device support (MIG UUID) [(0, 0)] >>> parse_cuda_visible_devices('MIG-GPU-3eb79704-1571-707c-aee8-f43ce747313d/13/0') # MIG device support (GPU UUID) [(0, 1)] >>> parse_cuda_visible_devices('MIG-GPU-3eb79704/13/0') # MIG device support (abbreviated GPU UUID) [(0, 1)] >>> parse_cuda_visible_devices('') # empty string [] >>> parse_cuda_visible_devices('0,0') # invalid `CUDA_VISIBLE_DEVICES` (duplicate device ordinal) [] >>> parse_cuda_visible_devices('16') # invalid `CUDA_VISIBLE_DEVICES` (device ordinal out of range) [] """ # pylint: disable=line-too-long if cuda_visible_devices is _VALUE_OMITTED: cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', default=None) return _parse_cuda_visible_devices(cuda_visible_devices, format='index') def normalize_cuda_visible_devices(cuda_visible_devices: str | None = _VALUE_OMITTED) -> str: """Parse the given ``CUDA_VISIBLE_DEVICES`` value and convert it into a comma-separated string of UUIDs. This function is aliased by :meth:`Device.normalize_cuda_visible_devices`. Note: The result could be empty string if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. See also for CUDA Device Enumeration: - `CUDA Environment Variables `_ - `CUDA Device Enumeration for MIG Device `_ Args: cuda_visible_devices (Optional[str]): The value of the ``CUDA_VISIBLE_DEVICES`` variable. 
If not given, the value from the environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES`` environment variable will be unset before parsing. Returns: str The comma-separated string (GPU UUIDs) of the ``CUDA_VISIBLE_DEVICES`` environment variable. Examples: >>> import os >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' >>> os.environ['CUDA_VISIBLE_DEVICES'] = '6,5' >>> normalize_cuda_visible_devices() # normalize the `CUDA_VISIBLE_DEVICES` environment variable to UUID strings 'GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794,GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1' >>> normalize_cuda_visible_devices('4') # pass the `CUDA_VISIBLE_DEVICES` value explicitly 'GPU-96de99c9-d68f-84c8-424c-7c75e59cc0a0' >>> normalize_cuda_visible_devices('GPU-18ef14e9,GPU-849d5a8d') # normalize abbreviated UUIDs 'GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1,GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794' >>> normalize_cuda_visible_devices(None) # get all devices when the `CUDA_VISIBLE_DEVICES` environment variable unset 'GPU-,GPU-,...' 
# all GPU UUIDs >>> normalize_cuda_visible_devices('MIG-d184f67c-c95f-5ef2-a935-195bd0094fbd') # MIG device support (MIG UUID) 'MIG-d184f67c-c95f-5ef2-a935-195bd0094fbd' >>> normalize_cuda_visible_devices('MIG-GPU-3eb79704-1571-707c-aee8-f43ce747313d/13/0') # MIG device support (GPU UUID) 'MIG-37b51284-1df4-5451-979d-3231ccb0822e' >>> normalize_cuda_visible_devices('MIG-GPU-3eb79704/13/0') # MIG device support (abbreviated GPU UUID) 'MIG-37b51284-1df4-5451-979d-3231ccb0822e' >>> normalize_cuda_visible_devices('') # empty string '' >>> normalize_cuda_visible_devices('0,0') # invalid `CUDA_VISIBLE_DEVICES` (duplicate device ordinal) '' >>> normalize_cuda_visible_devices('16') # invalid `CUDA_VISIBLE_DEVICES` (device ordinal out of range) '' """ # pylint: disable=line-too-long if cuda_visible_devices is _VALUE_OMITTED: cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', default=None) return ','.join(_parse_cuda_visible_devices(cuda_visible_devices, format='uuid')) # Helper functions ################################################################################# class _PhysicalDeviceAttrs(NamedTuple): index: int # type: ignore[assignment] name: str uuid: str support_mig_mode: bool _PHYSICAL_DEVICE_ATTRS: OrderedDict[str, _PhysicalDeviceAttrs] | None = None _GLOBAL_PHYSICAL_DEVICE: PhysicalDevice | None = None _GLOBAL_PHYSICAL_DEVICE_LOCK: threading.RLock = threading.RLock() def _get_all_physical_device_attrs() -> OrderedDict[str, _PhysicalDeviceAttrs]: global _PHYSICAL_DEVICE_ATTRS # pylint: disable=global-statement if _PHYSICAL_DEVICE_ATTRS is not None: return _PHYSICAL_DEVICE_ATTRS with _GLOBAL_PHYSICAL_DEVICE_LOCK: if _PHYSICAL_DEVICE_ATTRS is None: _PHYSICAL_DEVICE_ATTRS = OrderedDict( [ ( device.uuid(), _PhysicalDeviceAttrs( device.index, device.name(), device.uuid(), libnvml.nvmlCheckReturn(device.mig_mode()), ), ) for device in PhysicalDevice.all() ], ) return _PHYSICAL_DEVICE_ATTRS def _does_any_device_support_mig_mode(uuids: Iterable[str] | None = None) 
-> bool: physical_device_attrs = _get_all_physical_device_attrs() uuids = uuids or physical_device_attrs.keys() return any(physical_device_attrs[uuid].support_mig_mode for uuid in uuids) @contextlib.contextmanager def _global_physical_device(device: PhysicalDevice) -> Generator[PhysicalDevice]: global _GLOBAL_PHYSICAL_DEVICE # pylint: disable=global-statement with _GLOBAL_PHYSICAL_DEVICE_LOCK: try: _GLOBAL_PHYSICAL_DEVICE = device yield _GLOBAL_PHYSICAL_DEVICE finally: _GLOBAL_PHYSICAL_DEVICE = None def _get_global_physical_device() -> PhysicalDevice: with _GLOBAL_PHYSICAL_DEVICE_LOCK: return _GLOBAL_PHYSICAL_DEVICE # type: ignore[return-value] @overload def _parse_cuda_visible_devices( cuda_visible_devices: str | None, format: Literal['index'], # pylint: disable=redefined-builtin ) -> list[int] | list[tuple[int, int]]: ... @overload def _parse_cuda_visible_devices( cuda_visible_devices: str | None, format: Literal['uuid'], # pylint: disable=redefined-builtin ) -> list[str]: ... @functools.lru_cache() def _parse_cuda_visible_devices( # pylint: disable=too-many-branches,too-many-statements cuda_visible_devices: str | None = None, format: Literal['index', 'uuid'] = 'index', # pylint: disable=redefined-builtin ) -> list[int] | list[tuple[int, int]] | list[str]: """The underlining implementation for :meth:`parse_cuda_visible_devices`. 
The result will be cached.""" assert format in {'index', 'uuid'} try: physical_device_attrs = _get_all_physical_device_attrs() except libnvml.NVMLError: return [] gpu_uuids = set(physical_device_attrs) try: raw_uuids = ( subprocess.check_output( # noqa: S603 [ sys.executable, '-c', textwrap.dedent( f""" import nvitop.api.device print( ','.join( nvitop.api.device._parse_cuda_visible_devices_to_uuids( {cuda_visible_devices!r}, verbose=False, ), ), ) """, ), ], ) .decode('utf-8', errors='replace') .strip() .split(',') ) except subprocess.CalledProcessError: pass else: uuids = [ uuid if uuid in gpu_uuids else uuid.replace('GPU', 'MIG', 1) for uuid in map('GPU-{}'.format, raw_uuids) ] if gpu_uuids.issuperset(uuids) and not _does_any_device_support_mig_mode(uuids): if format == 'uuid': return uuids return [physical_device_attrs[uuid].index for uuid in uuids] cuda_visible_devices = ','.join(uuids) if cuda_visible_devices is None: cuda_visible_devices = ','.join(physical_device_attrs.keys()) devices: list[Device] = [] presented: set[str] = set() use_integer_identifiers: bool | None = None def from_index_or_uuid(index_or_uuid: int | str) -> Device: nonlocal use_integer_identifiers if isinstance(index_or_uuid, str): if index_or_uuid.isdigit(): index_or_uuid = int(index_or_uuid) elif Device.UUID_PATTERN.match(index_or_uuid) is None: raise libnvml.NVMLError_NotFound if use_integer_identifiers is None: use_integer_identifiers = isinstance(index_or_uuid, int) if isinstance(index_or_uuid, int) and use_integer_identifiers: return Device(index=index_or_uuid) if isinstance(index_or_uuid, str) and not use_integer_identifiers: return Device(uuid=index_or_uuid) raise ValueError('invalid identifier') def strip_identifier(identifier: str) -> str: identifier = identifier.strip() if len(identifier) > 0 and ( identifier[0].isdigit() or (len(identifier) > 1 and identifier[0] in {'+', '-'} and identifier[1].isdigit()) ): offset = 1 if identifier[0] in {'+', '-'} else 0 while offset < 
len(identifier) and identifier[offset].isdigit(): offset += 1 identifier = identifier[:offset] return identifier for identifier in map(strip_identifier, cuda_visible_devices.split(',')): if identifier in presented: return [] # duplicate identifiers found try: device = from_index_or_uuid(identifier) except (ValueError, libnvml.NVMLError): break devices.append(device) presented.add(identifier) mig_devices = [device for device in devices if device.is_mig_device()] if len(mig_devices) > 0: # Got MIG devices enumerated, use the first one devices = mig_devices[:1] # at most one MIG device is visible else: # All devices in `CUDA_VISIBLE_DEVICES` are physical devices # Check if any GPU that enables MIG mode devices_backup = devices.copy() devices = [] for device in devices_backup: if device.is_mig_mode_enabled(): # Got available MIG devices, use the first MIG device and ignore all non-MIG GPUs try: devices = [device.mig_device(mig_index=0)] # at most one MIG device is visible except libnvml.NVMLError: continue # no MIG device available on the GPU else: break # got one MIG device else: devices.append(device) # non-MIG device if format == 'uuid': return [device.uuid() for device in devices] return [device.index for device in devices] # type: ignore[return-value] def _parse_cuda_visible_devices_to_uuids( cuda_visible_devices: str | None = _VALUE_OMITTED, verbose: bool = True, ) -> list[str]: """Parse the given ``CUDA_VISIBLE_DEVICES`` environment variable in a separate process and return a list of device UUIDs. The UUIDs do not have a prefix ``GPU-`` or ``MIG-``. Args: cuda_visible_devices (Optional[str]): The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES`` environment variable will be unset before parsing. verbose (bool): Whether to raise an exception in the subprocess if failed to parse the ``CUDA_VISIBLE_DEVICES``. 
Returns: List[str] A list of device UUIDs without ``GPU-`` or ``MIG-`` prefixes. Raises: libcuda.CUDAError_NotInitialized: If cannot found the CUDA driver libraries. libcuda.CUDAError: If failed to parse the ``CUDA_VISIBLE_DEVICES`` environment variable. """ # pylint: disable=line-too-long if cuda_visible_devices is _VALUE_OMITTED: cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', default=None) # Do not inherit file descriptors and handles from the parent process # The `fork` start method should be considered unsafe as it can lead to crashes of the subprocess ctx = mp.get_context('spawn') queue = ctx.SimpleQueue() try: parser = ctx.Process( target=_cuda_visible_devices_parser, args=(cuda_visible_devices, queue, verbose), name='`CUDA_VISIBLE_DEVICES` parser', daemon=True, ) parser.start() parser.join() finally: parser.kill() result = queue.get() if isinstance(result, Exception): raise result return result def _cuda_visible_devices_parser( cuda_visible_devices: str | None, queue: mp.SimpleQueue, verbose: bool = True, ) -> None: try: if cuda_visible_devices is not None: os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices else: os.environ.pop('CUDA_VISIBLE_DEVICES', None) # pylint: disable=no-member try: libcuda.cuInit() except ( libcuda.CUDAError_NoDevice, libcuda.CUDAError_InvalidDevice, libcuda.CUDAError_SystemDriverMismatch, libcuda.CUDAError_CompatNotSupportedOnDevice, ): queue.put([]) raise count = libcuda.cuDeviceGetCount() uuids = [libcuda.cuDeviceGetUuid(libcuda.cuDeviceGet(i)) for i in range(count)] except Exception as ex: # pylint: disable=broad-except queue.put(ex) if verbose: raise else: queue.put(uuids) return finally: # Ensure non-empty queue queue.put(libcuda.CUDAError_NotInitialized()) # pylint: disable=no-member nvitop-1.4.2/nvitop/api/host.py000066400000000000000000000077031474547113600164770ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Shortcuts for package ``psutil``. ``psutil`` is a cross-platform library for retrieving information on running processes and system utilization (CPU, memory, disks, network, sensors) in Python. """ from __future__ import annotations import os as _os from typing import TYPE_CHECKING as _TYPE_CHECKING import psutil as _psutil from psutil import * # noqa: F403 # pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin if _TYPE_CHECKING: from collections.abc import Callable as _Callable __all__ = [name for name in _psutil.__all__ if not name.startswith('_')] + [ 'getuser', 'hostname', 'load_average', 'uptime', 'memory_percent', 'swap_percent', 'ppid_map', 'reverse_ppid_map', 'WSL', 'WINDOWS_SUBSYSTEM_FOR_LINUX', ] __all__[__all__.index('Error')] = 'PsutilError' PsutilError = Error = _psutil.Error # make alias del Error cpu_percent = _psutil.cpu_percent virtual_memory = _psutil.virtual_memory swap_memory = _psutil.swap_memory Process = _psutil.Process NoSuchProcess = _psutil.NoSuchProcess ZombieProcess = _psutil.ZombieProcess AccessDenied = _psutil.AccessDenied POSIX = _psutil.POSIX WINDOWS = _psutil.WINDOWS LINUX = _psutil.LINUX MACOS = _psutil.MACOS def getuser() -> str: """Get the current username from the environment or password database.""" import getpass # pylint: disable=import-outside-toplevel try: return 
getpass.getuser() except (ModuleNotFoundError, OSError): return _os.getlogin() def hostname() -> str: """Get the hostname of the machine.""" import platform # pylint: disable=import-outside-toplevel return platform.node() if hasattr(_psutil, 'getloadavg'): def load_average() -> tuple[float, float, float]: """Get the system load average.""" return _psutil.getloadavg() else: def load_average() -> None: # type: ignore[misc] """Get the system load average.""" return def uptime() -> float: """Get the system uptime.""" import time as _time # pylint: disable=import-outside-toplevel return _time.time() - _psutil.boot_time() def memory_percent() -> float: """The percentage usage of virtual memory, calculated as ``(total - available) / total * 100``.""" return virtual_memory().percent def swap_percent() -> float: """The percentage usage of virtual memory, calculated as ``used / total * 100``.""" return swap_memory().percent ppid_map: _Callable[[], dict[int, int]] = _psutil._ppid_map # pylint: disable=protected-access """Obtain a ``{pid: ppid, ...}`` dict for all running processes in one shot.""" def reverse_ppid_map() -> dict[int, list[int]]: # pylint: disable=function-redefined """Obtain a ``{ppid: [pid, ...], ...}`` dict for all running processes in one shot.""" from collections import defaultdict # pylint: disable=import-outside-toplevel tree = defaultdict(list) for pid, ppid in ppid_map().items(): tree[ppid].append(pid) return tree if LINUX: WSL = _os.getenv('WSL_DISTRO_NAME', default=None) if WSL is not None and WSL == '': WSL = 'WSL' else: WSL = None WINDOWS_SUBSYSTEM_FOR_LINUX = WSL """The Linux distribution name of the Windows Subsystem for Linux.""" nvitop-1.4.2/nvitop/api/libcuda.py000066400000000000000000001132641474547113600171250ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Python bindings for the `CUDA Driver APIs `_.""" # pylint: disable=invalid-name from __future__ import annotations import ctypes as _ctypes import itertools as _itertools import platform as _platform import string as _string import sys as _sys import threading as _threading from typing import TYPE_CHECKING as _TYPE_CHECKING from typing import Any as _Any from typing import ClassVar as _ClassVar if _TYPE_CHECKING: from collections.abc import Callable as _Callable from typing_extensions import Self as _Self # Python 3.11+ from typing_extensions import TypeAlias as _TypeAlias # Python 3.10+ # pylint: disable-next=missing-class-docstring,too-few-public-methods class _struct_c_CUdevice_t(_ctypes.Structure): pass # opaque handle _c_CUdevice_t: _TypeAlias = _ctypes.POINTER( # type: ignore[valid-type] # noqa: PYI042 _struct_c_CUdevice_t, ) _CUresult_t: _TypeAlias = _ctypes.c_uint # Error codes # # pylint: disable=line-too-long CUDA_SUCCESS = 0 """The API call returned with no errors. 
In the case of query calls, this also means that the operation being queried is complete (see :func:`cuEventQuery` and :func:`cuStreamQuery`).""" CUDA_ERROR_INVALID_VALUE = 1 """This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values.""" CUDA_ERROR_OUT_OF_MEMORY = 2 """The API call failed because it was unable to allocate enough memory to perform the requested operation.""" CUDA_ERROR_NOT_INITIALIZED = 3 """This indicates that the CUDA driver has not been initialized with :func:`cuInit` or that initialization has failed.""" CUDA_ERROR_DEINITIALIZED = 4 """This indicates that the CUDA driver is in the process of shutting down.""" CUDA_ERROR_PROFILER_DISABLED = 5 """This indicates profiler is not initialized for this run. This can happen when the application is running with external profiling tools like visual profiler.""" CUDA_ERROR_STUB_LIBRARY = 34 """This indicates that the CUDA driver that the application has loaded is a stub library. Applications that run with the stub rather than a real driver loaded will result in CUDA API returning this error.""" CUDA_ERROR_DEVICE_UNAVAILABLE = 46 """This indicates that requested CUDA device is unavailable at the current time. Devices are often unavailable due to use of :data:`CU_COMPUTEMODE_EXCLUSIVE_PROCESS` or :data:`CU_COMPUTEMODE_PROHIBITED`.""" CUDA_ERROR_NO_DEVICE = 100 """This indicates that no CUDA - capable devices were detected by the installed CUDA driver.""" CUDA_ERROR_INVALID_DEVICE = 101 """This indicates that the device ordinal supplied by the user does not correspond to a valid CUDA device or that the action requested is invalid for the specified device.""" CUDA_ERROR_DEVICE_NOT_LICENSED = 102 """This error indicates that the Grid license is not applied.""" CUDA_ERROR_INVALID_IMAGE = 200 """This indicates that the device kernel image is invalid. 
This can also indicate an invalid CUDA module.""" CUDA_ERROR_INVALID_CONTEXT = 201 """This most frequently indicates that there is no context bound to the current thread. This can also be returned if the context passed to an API call is not a valid handle (such as a context that has had :func:`cuCtxDestroy` invoked on it). This can also be returned if a user mixes different API versions (i.e. 3010 context with 3020 API calls). See :func:`cuCtxGetApiVersion` for more details.""" CUDA_ERROR_MAP_FAILED = 205 """This indicates that a map or register operation has failed.""" CUDA_ERROR_UNMAP_FAILED = 206 """This indicates that an unmap or unregister operation has failed.""" CUDA_ERROR_ARRAY_IS_MAPPED = 207 """This indicates that the specified array is currently mapped and thus cannot be destroyed.""" CUDA_ERROR_ALREADY_MAPPED = 208 """This indicates that the resource is already mapped.""" CUDA_ERROR_NO_BINARY_FOR_GPU = 209 """This indicates that there is no kernel image available that is suitable for the device. 
This can occur when a user specifies code generation options for a particular CUDA source file that do not include the corresponding device configuration.""" CUDA_ERROR_ALREADY_ACQUIRED = 210 """This indicates that a resource has already been acquired.""" CUDA_ERROR_NOT_MAPPED = 211 """This indicates that a resource is not mapped.""" CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212 """This indicates that a mapped resource is not available for access as an array.""" CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213 """This indicates that a mapped resource is not available for access as a pointer.""" CUDA_ERROR_ECC_UNCORRECTABLE = 214 """This indicates that an uncorrectable ECC error was detected during execution.""" CUDA_ERROR_UNSUPPORTED_LIMIT = 215 """This indicates that the :class:`CUlimit` passed to the API call is not supported by the active device.""" CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216 """This indicates that the :class:`CUcontext` passed to the API call can only be bound to a single CPU thread at a time but is already bound to a CPU thread.""" CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217 """This indicates that peer access is not supported across the given devices.""" CUDA_ERROR_INVALID_PTX = 218 """This indicates that a PTX JIT compilation failed.""" CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219 """This indicates an error with OpenGL or DirectX context.""" CUDA_ERROR_NVLINK_UNCORRECTABLE = 220 """This indicates that an uncorrectable NVLink error was detected during the execution.""" CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221 """This indicates that the PTX JIT compiler library was not found.""" CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222 """This indicates that the provided PTX was compiled with an unsupported toolchain.""" CUDA_ERROR_JIT_COMPILATION_DISABLED = 223 """This indicates that the PTX JIT compilation was disabled.""" CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224 """This indicates that the :class:`CUexecAffinityType` passed to the API call is not supported by the active device.""" 
CUDA_ERROR_INVALID_SOURCE = 300 """This indicates that the device kernel source is invalid. This includes compilation / linker errors encountered in device code or user error.""" CUDA_ERROR_FILE_NOT_FOUND = 301 """This indicates that the file specified was not found.""" CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302 """This indicates that a link to a shared object failed to resolve.""" CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303 """This indicates that initialization of a shared object failed.""" CUDA_ERROR_OPERATING_SYSTEM = 304 """This indicates that an OS call failed.""" CUDA_ERROR_INVALID_HANDLE = 400 """This indicates that a resource handle passed to the API call was not valid. Resource handles are opaque types like :class:`CUstream` and :class:`CUevent`.""" CUDA_ERROR_ILLEGAL_STATE = 401 """This indicates that a resource required by the API call is not in a valid state to perform the requested operation.""" CUDA_ERROR_NOT_FOUND = 500 """This indicates that a named symbol was not found. Examples of symbols are global / constant variable names, driver function names, texture names, and surface names.""" CUDA_ERROR_NOT_READY = 600 """This indicates that asynchronous operations issued previously have not completed yet. This result is not actually an error, but must be indicated differently than :data:`CUDA_SUCCESS` (which indicates completion). Calls that may return this value include :func:`cuEventQuery` and :func:`cuStreamQuery`.""" CUDA_ERROR_ILLEGAL_ADDRESS = 700 """While executing a kernel, the device encountered a load or store instruction on an invalid memory address. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched.""" CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701 """This indicates that a launch did not occur because it did not have appropriate resources. 
This error usually indicates that the user has attempted to pass too many arguments to the device kernel, or the kernel launch specifies too many threads for the kernel's register count. Passing arguments of the wrong size (i.e. a 64 - bit pointer when a 32 - bit int is expected) is equivalent to passing too many arguments and can also result in this error.""" CUDA_ERROR_LAUNCH_TIMEOUT = 702 """This indicates that the device kernel took too long to execute. This can only occur if timeouts are enabled - see the device attribute :data:`CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT` for more information. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched.""" CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703 """This error indicates a kernel launch that uses an incompatible texturing mode.""" CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704 """This error indicates that a call to :func:`cuCtxEnablePeerAccess` is trying to re - enable peer access to a context which has already had peer access to it enabled.""" CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705 """This error indicates that :func:`cuCtxDisablePeerAccess` is trying to disable peer access which has not been enabled yet via :func:`cuCtxEnablePeerAccess`.""" CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708 """This error indicates that the primary context for the specified device has already been initialized.""" CUDA_ERROR_CONTEXT_IS_DESTROYED = 709 """This error indicates that the context current to the calling thread has been destroyed using :func:`cuCtxDestroy`, or is a primary context which has not yet been initialized.""" CUDA_ERROR_ASSERT = 710 """A device - side assert triggered during kernel execution. The context cannot be used anymore, and must be destroyed. 
All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA.""" CUDA_ERROR_TOO_MANY_PEERS = 711 """This error indicates that the hardware resources required to enable peer access have been exhausted for one or more of the devices passed to :func:`cuCtxEnablePeerAccess`.""" CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712 """This error indicates that the memory range passed to :func:`cuMemHostRegister` has already been registered.""" CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713 """This error indicates that the pointer passed to :func:`cuMemHostUnregister` does not correspond to any currently registered memory region.""" CUDA_ERROR_HARDWARE_STACK_ERROR = 714 """While executing a kernel, the device encountered a stack error. This can be due to stack corruption or exceeding the stack size limit. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched.""" CUDA_ERROR_ILLEGAL_INSTRUCTION = 715 """While executing a kernel, the device encountered an illegal instruction. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched.""" CUDA_ERROR_MISALIGNED_ADDRESS = 716 """While executing a kernel, the device encountered a load or store instruction on a memory address which is not aligned. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched.""" CUDA_ERROR_INVALID_ADDRESS_SPACE = 717 """While executing a kernel, the device encountered an instruction which can only operate on memory locations in certain address spaces (global, shared, or local), but was supplied a memory address not belonging to an allowed address space. 
This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched.""" CUDA_ERROR_INVALID_PC = 718 """While executing a kernel, the device program counter wrapped its address space. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched.""" CUDA_ERROR_LAUNCH_FAILED = 719 """An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. Less common cases can be system specific - more information about these cases can be found in the system specific user guide. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched.""" CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720 """This error indicates that the number of blocks launched per grid for a kernel that was launched via either :func:`cuLaunchCooperativeKernel` or :func:`cuLaunchCooperativeKernelMultiDevice` exceeds the maximum number of blocks as allowed by :func:`cuOccupancyMaxActiveBlocksPerMultiprocessor` or :func:`cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` times the number of multiprocessors as specified by the device attribute :data:`CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT`.""" CUDA_ERROR_NOT_PERMITTED = 800 """This error indicates that the attempted operation is not permitted.""" CUDA_ERROR_NOT_SUPPORTED = 801 """This error indicates that the attempted operation is not supported on the current system or device.""" CUDA_ERROR_SYSTEM_NOT_READY = 802 """This error indicates that the system is not yet ready to start any CUDA work. To continue using CUDA, verify the system configuration is in a valid state and all required driver daemons are actively running. 
More information about this error can be found in the system specific user guide.""" CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803 """This error indicates that there is a mismatch between the versions of the display driver and the CUDA driver. Refer to the compatibility documentation for supported versions.""" CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804 """This error indicates that the system was upgraded to run with forward compatibility but the visible hardware detected by CUDA does not support this configuration. Refer to the compatibility documentation for the supported hardware matrix or ensure that only supported hardware is visible during initialization via the ``CUDA_VISIBLE_DEVICES`` environment variable.""" CUDA_ERROR_MPS_CONNECTION_FAILED = 805 """This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.""" CUDA_ERROR_MPS_RPC_FAILURE = 806 """This error indicates that the remote procedural call between the MPS server and the MPS client failed.""" CUDA_ERROR_MPS_SERVER_NOT_READY = 807 """This error indicates that the MPS server is not ready to accept new MPS client requests. 
This error can be returned when the MPS server is in the process of recovering from a fatal failure.""" CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808 """This error indicates that the hardware resources required to create MPS client have been exhausted.""" CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809 """This error indicates the the hardware resources required to support device connections have been exhausted.""" CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900 """This error indicates that the operation is not permitted when the stream is capturing.""" CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901 """This error indicates that the current capture sequence on the stream has been invalidated due to a previous error.""" CUDA_ERROR_STREAM_CAPTURE_MERGE = 902 """This error indicates that the operation would have resulted in a merge of two independent capture sequences.""" CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903 """This error indicates that the capture was initiated not in this stream.""" CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904 """This error indicates that the capture sequence contains a fork that was not joined to the primary stream.""" CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905 """This error indicates that a dependency would have been created which crosses the capture sequence boundary. 
Only implicit in -stream ordering dependencies are allowed to cross the boundary.""" CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906 """This error indicates a disallowed implicit dependency on a current capture sequence from :func:`cudaStreamLegacy`.""" CUDA_ERROR_CAPTURED_EVENT = 907 """This error indicates that the operation is not permitted on an event which was last recorded in a capturing stream.""" CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908 """A stream capture sequence not initiated with the :data:`CU_STREAM_CAPTURE_MODE_RELAXED` argument to :func:`cuStreamBeginCapture` was passed to :func:`cuStreamEndCapture` in a different thread.""" CUDA_ERROR_TIMEOUT = 909 """This error indicates that the timeout specified for the wait operation has lapsed.""" CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910 """This error indicates that the graph update was not performed because it included changes which violated constraints specific to instantiated graph update.""" CUDA_ERROR_EXTERNAL_DEVICE = 911 """This indicates that an async error has occurred in a device outside of CUDA. If CUDA was waiting for an external device's signal before consuming shared data, the external device signaled an error indicating that the data is not valid for consumption. This leaves the process in an inconsistent state and any further CUDA work will return the same error. 
To continue using CUDA, the process must be terminated and relaunched.""" CUDA_ERROR_UNKNOWN = 999 """This indicates that an unknown internal error has occurred.""" # pylint: enable=line-too-long # Error Checking # class CUDAError(Exception): """Base exception class for CUDA driver query errors.""" _value_class_mapping: _ClassVar[dict[int, type[CUDAError]]] = {} _errcode_to_string: _ClassVar[dict[int, str]] = { # List of currently known error codes CUDA_ERROR_NOT_INITIALIZED: 'Initialization error.', CUDA_ERROR_NOT_FOUND: 'Named symbol not found.', CUDA_ERROR_INVALID_VALUE: 'Invalid argument.', CUDA_ERROR_NO_DEVICE: 'No CUDA-capable device is detected.', CUDA_ERROR_INVALID_DEVICE: 'Invalid device ordinal.', CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: 'System has unsupported display driver / CUDA driver combination.', CUDA_ERROR_DEINITIALIZED: 'Driver shutting down.', CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: 'Forward compatibility was attempted on non supported Hardware.', CUDA_ERROR_INVALID_CONTEXT: 'Invalid device context.', } # fmt:skip _errcode_to_name: _ClassVar[dict[int, str]] = {} value: int def __new__(cls, value: int) -> _Self: """Map value to a proper subclass of :class:`CUDAError`.""" if cls is CUDAError: # pylint: disable-next=self-cls-assignment cls = CUDAError._value_class_mapping.get(value, cls) # type: ignore[assignment] obj = Exception.__new__(cls) obj.value = value return obj def __repr__(self) -> str: """Return a string representation of the error.""" # pylint: disable=no-member try: if self.value not in CUDAError._errcode_to_string: CUDAError._errcode_to_string[self.value] = '{}.'.format( cuGetErrorString(self.value).rstrip('.').capitalize(), ) if self.value not in CUDAError._errcode_to_name: CUDAError._errcode_to_name[self.value] = cuGetErrorName(self.value) return ( f'{CUDAError._errcode_to_string[self.value]} ' f'Code: {CUDAError._errcode_to_name[self.value]} ({self.value}).' ) except CUDAError: return f'CUDA Error with code {self.value}.' 
def __eq__(self, other: object) -> bool: """Test equality to other object.""" if not isinstance(other, CUDAError): return NotImplemented return self.value == other.value # pylint: disable=no-member def __reduce__(self) -> tuple[type[CUDAError], tuple[int]]: """Return state information for pickling.""" return CUDAError, (self.value,) # pylint: disable=no-member def cudaExceptionClass(cudaErrorCode: int) -> type[CUDAError]: """Map value to a proper subclass of :class:`CUDAError`. Raises: ValueError: If the error code is not valid. """ if cudaErrorCode not in CUDAError._value_class_mapping: # pylint: disable=protected-access raise ValueError(f'cudaErrorCode {cudaErrorCode} is not valid.') return CUDAError._value_class_mapping[cudaErrorCode] # pylint: disable=protected-access def _extract_cuda_errors_as_classes() -> None: """Generate a hierarchy of classes on top of :class:`CUDAError` class. Each CUDA Error gets a new :class:`CUDAError` subclass. This way try-except blocks can filter appropriate exceptions more easily. :class:`CUDAError` is a parent class. Each ``CUDA_ERROR_*`` gets it's own subclass. e.g. :data:`CUDA_ERROR_INVALID_VALUE` will be turned into :class:`CUDAError_InvalidValue`. """ this_module = _sys.modules[__name__] cuda_error_names = [x for x in dir(this_module) if x.startswith('CUDA_ERROR_')] for err_name in cuda_error_names: # e.g. 
Turn CUDA_ERROR_INVALID_VALUE into CUDAError_InvalidValue pascal_case = _string.capwords(err_name.replace('CUDA_ERROR_', ''), '_').replace('_', '') class_name = f'CUDAError_{pascal_case}' err_val = getattr(this_module, err_name) def gen_new(value: int) -> _Callable[[type[CUDAError]], CUDAError]: def new(cls: type[CUDAError]) -> CUDAError: return CUDAError.__new__(cls, value) return new # pylint: disable=protected-access new_error_class = type(class_name, (CUDAError,), {'__new__': gen_new(err_val)}) new_error_class.__module__ = __name__ if err_val in CUDAError._errcode_to_string: new_error_class.__doc__ = ( f'CUDA Error: {CUDAError._errcode_to_string[err_val]} ' f'Code: :data:`{err_name}` ({err_val}).' ) else: new_error_class.__doc__ = f'CUDA Error with code :data:`{err_name}` ({err_val})' setattr(this_module, class_name, new_error_class) CUDAError._value_class_mapping[err_val] = new_error_class CUDAError._errcode_to_name[err_val] = err_name # Add explicit references to appease linters class __CUDAError(CUDAError): value: int def __new__(cls) -> CUDAError: # type: ignore[misc,empty-body] ... 
CUDAError_NotInitialized: type[__CUDAError] CUDAError_NotFound: type[__CUDAError] CUDAError_InvalidValue: type[__CUDAError] CUDAError_NoDevice: type[__CUDAError] CUDAError_InvalidDevice: type[__CUDAError] CUDAError_SystemDriverMismatch: type[__CUDAError] CUDAError_Deinitialized: type[__CUDAError] CUDAError_CompatNotSupportedOnDevice: type[__CUDAError] CUDAError_InvalidContext: type[__CUDAError] _extract_cuda_errors_as_classes() del _extract_cuda_errors_as_classes def _cudaCheckReturn(ret: _Any) -> _Any: if ret != CUDA_SUCCESS: raise CUDAError(ret) return ret # Function access # __cudaLib: _ctypes.CDLL | None = None __initialized: bool = False __libLoadLock: _threading.Lock = _threading.Lock() # Function pointers are cached to prevent unnecessary libLoadLock locking __cudaGetFunctionPointer_cache: dict[str, _ctypes._CFuncPtr] = {} # type: ignore[name-defined] def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr: # type: ignore[name-defined] """Get the function pointer from the CUDA driver library. Raises: CUDAError_NotInitialized: If cannot found the CUDA driver library. CUDAError_NotFound: If cannot found the function pointer. """ if name in __cudaGetFunctionPointer_cache: return __cudaGetFunctionPointer_cache[name] with __libLoadLock: # Ensure library was loaded if __cudaLib is None: raise CUDAError(CUDA_ERROR_NOT_INITIALIZED) try: __cudaGetFunctionPointer_cache[name] = getattr(__cudaLib, name) return __cudaGetFunctionPointer_cache[name] except AttributeError as ex: raise CUDAError(CUDA_ERROR_NOT_FOUND) from ex def __LoadCudaLibrary() -> None: """Load the library if it isn't loaded already. Raises: CUDAError_NotInitialized: If cannot found the CUDA driver library. 
""" global __cudaLib # pylint: disable=global-statement if __cudaLib is None: # Lock to ensure only one caller loads the library with __libLoadLock: # Ensure the library still isn't loaded if __cudaLib is None: # Platform specific libcuda location system = _platform.system() if system == 'Darwin': lib_filenames = [ 'libcuda.1.dylib', # check library path first 'libcuda.dylib', '/usr/local/cuda/lib/libcuda.1.dylib', '/usr/local/cuda/lib/libcuda.dylib', ] elif system == 'Linux': lib_filenames = [ 'libcuda.so', # check library path first '/usr/lib64/nvidia/libcuda.so', # Redhat/CentOS/Fedora '/usr/lib/x86_64-linux-gnu/libcuda.so', # Ubuntu '/usr/lib/wsl/lib/libcuda.so', # WSL ] # Also add libraries with version suffix `.1` lib_filenames = list( _itertools.chain.from_iterable((f'{lib}.1', lib) for lib in lib_filenames), ) elif system == 'Windows': bits = 8 * _ctypes.sizeof(_ctypes.c_void_p) # 64 or 32 lib_filenames = [f'nvcuda{bits}.dll', 'nvcuda.dll'] # Open library for lib_filename in lib_filenames: try: __cudaLib = _ctypes.CDLL(lib_filename) break except OSError: pass if __cudaLib is None: _cudaCheckReturn(CUDA_ERROR_NOT_INITIALIZED) def cuInit(flags: int = 0) -> None: """Initialize the CUDA driver API. Initialize the driver API and must be called before any other function from the driver API. Currently, the ``flags`` parameter must be :data:`0`. If :func:`cuInit` has not been called, any function from the driver API will return :data:`CUDA_ERROR_NOT_INITIALIZED`. Raises: CUDAError_NoDevice: If no CUDA-capable device is available. CUDAError_InvalidDevice: If the device ordinal supplied by the user does not correspond to a valid CUDA device or that the action requested is invalid for the specified device. CUDAError_SystemDriverMismatch: If there is a mismatch between the versions of the display driver and the CUDA driver. 
CUDAError_CompatNotSupportedOnDevice: If the system was upgraded to run with forward compatibility but the visible hardware detected by CUDA does not support this configuration. CUDAError_InvalidValue: If passed with invalid flag value. CUDAError_NotInitialized: If cannot found the CUDA driver library. """ global __initialized # pylint: disable=global-statement __LoadCudaLibrary() if __initialized: return fn = __cudaGetFunctionPointer('cuInit') ret = fn(_ctypes.c_uint(flags)) _cudaCheckReturn(ret) with __libLoadLock: __initialized = True def cuGetErrorName(error: int) -> str: """Get the string representation of an error code enum name. Raises: CUDAError_InvalidValue: If the error code is not recognized. CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ fn = __cudaGetFunctionPointer('cuGetErrorName') p_name = _ctypes.POINTER(_ctypes.c_char_p)() ret = fn(_CUresult_t(error), _ctypes.byref(p_name)) _cudaCheckReturn(ret) name = _ctypes.string_at(p_name) return name.decode('utf-8', errors='replace') def cuGetErrorString(error: int) -> str: """Get the string description of an error code. Raises: CUDAError_InvalidValue: If the error code is not recognized. CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ fn = __cudaGetFunctionPointer('cuGetErrorString') p_name = _ctypes.POINTER(_ctypes.c_char_p)() ret = fn(_CUresult_t(error), _ctypes.byref(p_name)) _cudaCheckReturn(ret) name = _ctypes.string_at(p_name) return name.decode('utf-8', errors='replace') def cuDriverGetVersion() -> str: """Get the latest CUDA version supported by driver. Returns: A string of the form :data:`'.'`. Raises: CUDAError_InvalidValue: If the driver call fails. CUDAError_NotInitialized: If the CUDA driver API is not initialized. 
""" fn = __cudaGetFunctionPointer('cuDriverGetVersion') driver_version = _ctypes.c_int() ret = fn(_ctypes.byref(driver_version)) _cudaCheckReturn(ret) major = driver_version.value // 1000 minor = (driver_version.value % 1000) // 10 return f'{major}.{minor}' def cuDeviceGetCount() -> int: """Get the number of compute-capable devices. Returns: int The number of devices with compute capability greater than or equal to 2.0 that are available for execution. If there is no such device, :func:`cuDeviceGetCount` returns :data:`0`. Raises: CUDAError_InvalidContext: If there is no context bound to the current thread. CUDAError_InvalidValue: If the driver call fails. CUDAError_Deinitialized: If the CUDA driver in the process is shutting down. CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ fn = __cudaGetFunctionPointer('cuDeviceGetCount') count = _ctypes.c_int(0) ret = fn(_ctypes.byref(count)) _cudaCheckReturn(ret) return count.value def cuDeviceGet(ordinal: int) -> _c_CUdevice_t: """Get a handle to a compute device. Returns: A device handle given an ordinal in the range :code:`[0, ..., cuDeviceGetCount() - 1]`. Raises: CUDAError_InvalidContext: If there is no context bound to the current thread. CUDAError_InvalidDevice: If the device ordinal supplied by the user does not correspond to a valid CUDA device or that the action requested is invalid for the specified device. CUDAError_InvalidValue: If the driver call fails. CUDAError_Deinitialized: If the CUDA driver in the process is shutting down. CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ fn = __cudaGetFunctionPointer('cuDeviceGet') device = _c_CUdevice_t() ret = fn(_ctypes.byref(device), _ctypes.c_int(ordinal)) _cudaCheckReturn(ret) return device def cuDeviceGetByPCIBusId(pciBusId: str) -> _c_CUdevice_t: """Get a handle to a compute device. 
Args: pciBusId (str): String in one of the following forms: ``[domain]:[bus]:[device].[function]``, ``[domain]:[bus]:[device]``, ``[bus]:[device].[function]`` where ``domain``, ``bus``, ``device``, and ``function`` are all hexadecimal values. Returns: int A device handle given a PCI bus ID string. Raises: CUDAError_InvalidDevice: If the device ordinal supplied by the user does not correspond to a valid CUDA device or that the action requested is invalid for the specified device. CUDAError_InvalidValue: If the value of :data:`pciBusId` is not a valid PCI bus identifier. CUDAError_Deinitialized: If the CUDA driver in the process is shutting down. CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ fn = __cudaGetFunctionPointer('cuDeviceGetByPCIBusId') device = _c_CUdevice_t() ret = fn(_ctypes.byref(device), _ctypes.c_char_p(pciBusId.encode('utf-8'))) _cudaCheckReturn(ret) return device def cuDeviceGetPCIBusId(device: _c_CUdevice_t) -> str: """Get a PCI Bus Id string for the device. Returns: str An identifier string for the device in the following format ``[domain]:[bus]:[device].[function]`` where ``domain``, ``bus``, ``device``, and ``function`` are all hexadecimal values. Raises: CUDAError_InvalidDevice: If the device ordinal supplied by the user does not correspond to a valid CUDA device or that the action requested is invalid for the specified device. CUDAError_InvalidValue: If the driver call fails. CUDAError_Deinitialized: If the CUDA driver in the process is shutting down. CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ fn = __cudaGetFunctionPointer('cuDeviceGetPCIBusId') pciBusId = _ctypes.create_string_buffer(256) ret = fn(pciBusId, _ctypes.c_int(256), device) _cudaCheckReturn(ret) return pciBusId.value.decode('utf-8', errors='replace') def cuDeviceGetName(device: _c_CUdevice_t) -> str: """Get an identifier string for the device. Returns: str An ASCII string identifying the device. 
Raises: CUDAError_InvalidContext: If there is no context bound to the current thread. CUDAError_InvalidDevice: If the device ordinal supplied by the user does not correspond to a valid CUDA device or that the action requested is invalid for the specified device. CUDAError_InvalidValue: If the driver call fails. CUDAError_Deinitialized: If the CUDA driver in the process is shutting down. CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ fn = __cudaGetFunctionPointer('cuDeviceGetName') name = _ctypes.create_string_buffer(256) ret = fn(name, _ctypes.c_int(256), device) _cudaCheckReturn(ret) return name.value.decode('utf-8', errors='replace') def cuDeviceGetUuid(device: _c_CUdevice_t) -> str: """Get a UUID for the device. Raises: CUDAError_InvalidDevice: If the device ordinal supplied by the user does not correspond to a valid CUDA device or that the action requested is invalid for the specified device. CUDAError_InvalidValue: If the driver call fails. CUDAError_Deinitialized: If the CUDA driver in the process is shutting down. CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ try: fn = __cudaGetFunctionPointer('cuDeviceGetUuid_v2') except CUDAError_NotFound: # noqa: F821 # pylint: disable=undefined-variable fn = __cudaGetFunctionPointer('cuDeviceGetUuid') uuid = (_ctypes.c_ubyte * 16)() ret = fn(uuid, device) _cudaCheckReturn(ret) uuid = ''.join(map('{:02x}'.format, uuid)) return '-'.join((uuid[:8], uuid[8:12], uuid[12:16], uuid[16:20], uuid[20:32])) def cuDeviceGetUuid_v2(device: _c_CUdevice_t) -> str: """Get a UUID for the device (CUDA 11.4+). Raises: CUDAError_InvalidDevice: If the device ordinal supplied by the user does not correspond to a valid CUDA device or that the action requested is invalid for the specified device. CUDAError_InvalidValue: If the driver call fails. CUDAError_Deinitialized: If the CUDA driver in the process is shutting down. CUDAError_NotInitialized: If the CUDA driver API is not initialized. 
""" fn = __cudaGetFunctionPointer('cuDeviceGetUuid_v2') uuid = (_ctypes.c_ubyte * 16)() ret = fn(uuid, device) _cudaCheckReturn(ret) uuid = ''.join(map('{:02x}'.format, uuid)) return '-'.join((uuid[:8], uuid[8:12], uuid[12:16], uuid[16:20], uuid[20:32])) def cuDeviceTotalMem(device: _c_CUdevice_t) -> int: """Get the total amount of memory on the device (in bytes). Raises: CUDAError_InvalidContext: If there is no context bound to the current thread. CUDAError_InvalidDevice: If the device ordinal supplied by the user does not correspond to a valid CUDA device or that the action requested is invalid for the specified device. CUDAError_InvalidValue: If the driver call fails. CUDAError_Deinitialized: If the CUDA driver in the process is shutting down. CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ fn = __cudaGetFunctionPointer('cuDeviceTotalMem') bytes = _ctypes.c_size_t() # pylint: disable=redefined-builtin ret = fn(_ctypes.byref(bytes), device) _cudaCheckReturn(ret) return bytes.value def is_available() -> bool: """Test whether there are any CUDA visible devices.""" try: return cuDeviceGetCount() > 0 except CUDAError: return False nvitop-1.4.2/nvitop/api/libcudart.py000066400000000000000000001254561474547113600175010ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== """Python bindings for the `CUDA Runtime APIs `_.""" # pylint: disable=invalid-name from __future__ import annotations import ctypes as _ctypes import glob as _glob import os as _os import platform as _platform import sys as _sys import threading as _threading from typing import TYPE_CHECKING as _TYPE_CHECKING from typing import Any as _Any from typing import ClassVar as _ClassVar if _TYPE_CHECKING: from collections.abc import Callable as _Callable from typing_extensions import Self as _Self # Python 3.11+ _cudaError_t = _ctypes.c_int # Error codes # # pylint: disable=line-too-long cudaSuccess = 0 """The API call returned with no errors. In the case of query calls, this also means that the operation being queried is complete (see :func:`cudaEventQuery` and :func:`cudaStreamQuery`).""" cudaErrorInvalidValue = 1 """This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values.""" cudaErrorMemoryAllocation = 2 """The API call failed because it was unable to allocate enough memory to perform the requested operation.""" cudaErrorInitializationError = 3 """The API call failed because the CUDA driver and runtime could not be initialized.""" cudaErrorCudartUnloading = 4 """This indicates that a CUDA Runtime API call cannot be executed because it is being called during process shut down, at a point in time after CUDA driver has been unloaded.""" cudaErrorProfilerDisabled = 5 """This indicates profiler is not initialized for this run. This can happen when the application is running with external profiling tools like visual profiler.""" cudaErrorInvalidConfiguration = 9 """This indicates that a kernel launch is requesting resources that can never be satisfied by the current device. Requesting more shared memory per block than the device supports will trigger this error, as will requesting too many threads or blocks. 
See cudaDeviceProp for more device limitations.""" cudaErrorInvalidPitchValue = 12 """This indicates that one or more of the pitch-related parameters passed to the API call is not within the acceptable range for pitch.""" cudaErrorInvalidSymbol = 13 """This indicates that the symbol name / identifier passed to the API call is not a valid name or identifier.""" cudaErrorInvalidTexture = 18 """This indicates that the texture passed to the API call is not a valid texture.""" cudaErrorInvalidTextureBinding = 19 """This indicates that the texture binding is not valid. This occurs if you call :func:`cudaGetTextureAlignmentOffset` with an unbound texture.""" cudaErrorInvalidChannelDescriptor = 20 """This indicates that the channel descriptor passed to the API call is not valid. This occurs if the format is not one of the formats specified by :data:`cudaChannelFormatKind`, or if one of the dimensions is invalid.""" cudaErrorInvalidMemcpyDirection = 21 """This indicates that the direction of the :func:`memcpy` passed to the API call is not one of the types specified by :data:`cudaMemcpyKind`.""" cudaErrorInvalidFilterSetting = 26 """This indicates that a non-float texture was being accessed with linear filtering. This is not supported by CUDA.""" cudaErrorInvalidNormSetting = 27 """This indicates that an attempt was made to read a non-float texture as a normalized float. This is not supported by CUDA.""" cudaErrorStubLibrary = 34 """This indicates that the CUDA driver that the application has loaded is a stub library. Applications that run with the stub rather than a real driver loaded will result in CUDA API returning this error.""" cudaErrorInsufficientDriver = 35 """This indicates that the installed NVIDIA CUDA driver is older than the CUDA Runtime library. This is not a supported configuration. 
Users should install an updated NVIDIA display driver to allow the application to run.""" cudaErrorCallRequiresNewerDriver = 36 """This indicates that the API call requires a newer CUDA driver than the one currently installed. Users should install an updated NVIDIA CUDA driver to allow the API call to succeed.""" cudaErrorInvalidSurface = 37 """This indicates that the surface passed to the API call is not a valid surface.""" cudaErrorDuplicateVariableName = 43 """This indicates that multiple global or constant variables (across separate CUDA source files in the application) share the same string name.""" cudaErrorDuplicateTextureName = 44 """This indicates that multiple textures (across separate CUDA source files in the application) share the same string name.""" cudaErrorDuplicateSurfaceName = 45 """This indicates that multiple surfaces (across separate CUDA source files in the application) share the same string name.""" cudaErrorDevicesUnavailable = 46 """This indicates that all CUDA devices are busy or unavailable at the current time. Devices are often busy / unavailable due to use of :data:`cudaComputeModeProhibited`, :data:`cudaComputeModeExclusiveProcess`, or when long running CUDA kernels have filled up the GPU and are blocking new work from starting. They can also be unavailable due to memory constraints on a device that already has active CUDA work being performed.""" cudaErrorIncompatibleDriverContext = 49 """This indicates that the current context is not compatible with this the CUDA Runtime. This can only occur if you are using CUDA Runtime / Driver interoperability and have created an existing Driver context using the driver API. 
The Driver context may be incompatible either because the Driver context was created using an
older version of the API, because the Runtime API call expects a primary driver context and the
Driver context is not primary, or because the Driver context has been destroyed."""

cudaErrorMissingConfiguration = 52
"""The device function being invoked (usually via :func:`cudaLaunchKernel`) was not previously
configured via the :func:`cudaConfigureCall` function."""

cudaErrorLaunchMaxDepthExceeded = 65
"""This error indicates that a device runtime grid launch did not occur because the depth of the
child grid would exceed the maximum supported number of nested grid launches."""

cudaErrorLaunchFileScopedTex = 66
"""This error indicates that a grid launch did not occur because the kernel uses file-scoped
textures which are unsupported by the device runtime. Kernels launched via the device runtime only
support textures created with the Texture Object API's."""

cudaErrorLaunchFileScopedSurf = 67
"""This error indicates that a grid launch did not occur because the kernel uses file-scoped
surfaces which are unsupported by the device runtime. Kernels launched via the device runtime only
support surfaces created with the Surface Object API's."""

cudaErrorSyncDepthExceeded = 68
"""This error indicates that a call to :func:`cudaDeviceSynchronize` made from the device runtime
failed because the call was made at grid depth greater than either the default (2 levels of grids)
or user specified device limit :data:`cudaLimitDevRuntimeSyncDepth`. To be able to synchronize on
launched grids at a greater depth successfully, the maximum nested depth at which
:func:`cudaDeviceSynchronize` will be called must be specified with the
:data:`cudaLimitDevRuntimeSyncDepth` limit to the :func:`cudaDeviceSetLimit` api before the
host-side launch of a kernel using the device runtime. Keep in mind that additional levels of sync
depth require the runtime to reserve large amounts of device memory that cannot be used for user
allocations. Note that :func:`cudaDeviceSynchronize` made from device runtime is only supported on
devices of compute capability < 9.0."""

cudaErrorLaunchPendingCountExceeded = 69
"""This error indicates that a device runtime grid launch failed because the launch would exceed
the limit :data:`cudaLimitDevRuntimePendingLaunchCount`. For this launch to proceed successfully,
:func:`cudaDeviceSetLimit` must be called to set the
:data:`cudaLimitDevRuntimePendingLaunchCount` to be higher than the upper bound of outstanding
launches that can be issued to the device runtime. Keep in mind that raising the limit of pending
device runtime launches will require the runtime to reserve device memory that cannot be used for
user allocations."""

cudaErrorInvalidDeviceFunction = 98
"""The requested device function does not exist or is not compiled for the proper device
architecture."""

cudaErrorNoDevice = 100
"""This indicates that no CUDA-capable devices were detected by the installed CUDA driver."""

cudaErrorInvalidDevice = 101
"""This indicates that the device ordinal supplied by the user does not correspond to a valid CUDA
device or that the action requested is invalid for the specified device."""

cudaErrorDeviceNotLicensed = 102
"""This indicates that the device doesn't have a valid Grid License."""

cudaErrorSoftwareValidityNotEstablished = 103
"""By default, the CUDA Runtime may perform a minimal set of self-tests, as well as CUDA driver
tests, to establish the validity of both. Introduced in CUDA 11.2, this error return indicates
that at least one of these tests has failed and the validity of either the runtime or the driver
could not be established."""

cudaErrorStartupFailure = 127
"""This indicates an internal startup failure in the CUDA Runtime."""

cudaErrorInvalidKernelImage = 200
"""This indicates that the device kernel image is invalid."""

cudaErrorDeviceUninitialized = 201
"""This most frequently indicates that there is no context bound to the current thread. This can
also be returned if the context passed to an API call is not a valid handle (such as a context
that has had :func:`cuCtxDestroy` invoked on it). This can also be returned if a user mixes
different API versions (i.e. 3010 context with 3020 API calls)."""

cudaErrorMapBufferObjectFailed = 205
"""This indicates that the buffer object could not be mapped."""

cudaErrorUnmapBufferObjectFailed = 206
"""This indicates that the buffer object could not be unmapped."""

cudaErrorArrayIsMapped = 207
"""This indicates that the specified array is currently mapped and thus cannot be destroyed."""

cudaErrorAlreadyMapped = 208
"""This indicates that the resource is already mapped."""

cudaErrorNoKernelImageForDevice = 209
"""This indicates that there is no kernel image available that is suitable for the device. This
can occur when a user specifies code generation options for a particular CUDA source file that do
not include the corresponding device configuration."""

cudaErrorAlreadyAcquired = 210
"""This indicates that a resource has already been acquired."""

cudaErrorNotMapped = 211
"""This indicates that a resource is not mapped."""

cudaErrorNotMappedAsArray = 212
"""This indicates that a mapped resource is not available for access as an array."""

cudaErrorNotMappedAsPointer = 213
"""This indicates that a mapped resource is not available for access as a pointer."""

cudaErrorECCUncorrectable = 214
"""This indicates that an uncorrectable ECC error was detected during execution."""

cudaErrorUnsupportedLimit = 215
"""This indicates that the :class:`cudaLimit` passed to the API call is not supported by the
active device."""

cudaErrorDeviceAlreadyInUse = 216
"""This indicates that a call tried to access an exclusive-thread device that is already in use by
a different thread."""

cudaErrorPeerAccessUnsupported = 217
"""This error indicates that P2P access is not supported across the given devices."""

cudaErrorInvalidPtx = 218
"""A PTX compilation failed. The runtime may fall back to compiling PTX if an application does not
contain a suitable binary for the current device."""

cudaErrorInvalidGraphicsContext = 219
"""This indicates an error with the OpenGL or DirectX context."""

cudaErrorNvlinkUncorrectable = 220
"""This indicates that an uncorrectable NVLink error was detected during the execution."""

cudaErrorJitCompilerNotFound = 221
"""This indicates that the PTX JIT compiler library was not found. The JIT Compiler library is
used for PTX compilation. The runtime may fall back to compiling PTX if an application does not
contain a suitable binary for the current device."""

cudaErrorUnsupportedPtxVersion = 222
"""This indicates that the provided PTX was compiled with an unsupported toolchain. The most
common reason for this, is the PTX was generated by a compiler newer than what is supported by the
CUDA driver and PTX JIT compiler."""

cudaErrorJitCompilationDisabled = 223
"""This indicates that the JIT compilation was disabled. The JIT compilation compiles PTX. The
runtime may fall back to compiling PTX if an application does not contain a suitable binary for
the current device."""

cudaErrorUnsupportedExecAffinity = 224
"""This indicates that the provided execution affinity is not supported by the device."""

cudaErrorInvalidSource = 300
"""This indicates that the device kernel source is invalid."""

cudaErrorFileNotFound = 301
"""This indicates that the file specified was not found."""

cudaErrorSharedObjectSymbolNotFound = 302
"""This indicates that a link to a shared object failed to resolve."""

cudaErrorSharedObjectInitFailed = 303
"""This indicates that initialization of a shared object failed."""

cudaErrorOperatingSystem = 304
"""This error indicates that an OS call failed."""

cudaErrorInvalidResourceHandle = 400
"""This indicates that a resource handle passed to the API call was not valid. Resource handles
are opaque types like :data:`cudaStream_t` and :data:`cudaEvent_t`."""

cudaErrorIllegalState = 401
"""This indicates that a resource required by the API call is not in a valid state to perform the
requested operation."""

cudaErrorSymbolNotFound = 500
"""This indicates that a named symbol was not found. Examples of symbols are global / constant
variable names, driver function names, texture names, and surface names."""

cudaErrorNotReady = 600
"""This indicates that asynchronous operations issued previously have not completed yet. This
result is not actually an error, but must be indicated differently than :data:`cudaSuccess` (which
indicates completion). Calls that may return this value include :func:`cudaEventQuery` and
:func:`cudaStreamQuery`."""

cudaErrorIllegalAddress = 700
"""The device encountered a load or store instruction on an invalid memory address. This leaves
the process in an inconsistent state and any further CUDA work will return the same error. To
continue using CUDA, the process must be terminated and relaunched."""

cudaErrorLaunchOutOfResources = 701
"""This indicates that a launch did not occur because it did not have appropriate resources.
Although this error is similar to :data:`cudaErrorInvalidConfiguration`, this error usually
indicates that the user has attempted to pass too many arguments to the device kernel, or the
kernel launch specifies too many threads for the kernel's register count."""

cudaErrorLaunchTimeout = 702
"""This indicates that the device kernel took too long to execute. This can only occur if timeouts
are enabled - see the device property kernelExecTimeoutEnabled for more information. This leaves
the process in an inconsistent state and any further CUDA work will return the same error. To
continue using CUDA, the process must be terminated and relaunched."""

cudaErrorLaunchIncompatibleTexturing = 703
"""This error indicates a kernel launch that uses an incompatible texturing mode."""

cudaErrorPeerAccessAlreadyEnabled = 704
"""This error indicates that a call to :func:`cudaDeviceEnablePeerAccess` is trying to re-enable
peer addressing on from a context which has already had peer addressing enabled."""

cudaErrorPeerAccessNotEnabled = 705
"""This error indicates that :func:`cudaDeviceDisablePeerAccess` is trying to disable peer
addressing which has not been enabled yet via :func:`cudaDeviceEnablePeerAccess`."""

cudaErrorSetOnActiveProcess = 708
"""This indicates that the user has called :func:`cudaSetValidDevices`,
:func:`cudaSetDeviceFlags`, :func:`cudaD3D9SetDirect3DDevice`, :func:`cudaD3D10SetDirect3DDevice`,
:func:`cudaD3D11SetDirect3DDevice`, or :func:`cudaVDPAUSetVDPAUDevice` after initializing the CUDA
Runtime by calling non-device management operations (allocating memory and launching kernels are
examples of non-device management operations). This error can also be returned if using runtime /
driver interoperability and there is an existing :class:`CUcontext` active on the host thread."""

cudaErrorContextIsDestroyed = 709
"""This error indicates that the context current to the calling thread has been destroyed using
cuCtxDestroy, or is a primary context which has not yet been initialized."""

cudaErrorAssert = 710
"""An assert triggered in device code during kernel execution. The device cannot be used again.
All existing allocations are invalid. To continue using CUDA, the process must be terminated and
relaunched."""

cudaErrorTooManyPeers = 711
"""This error indicates that the hardware resources required to enable peer access have been
exhausted for one or more of the devices passed to :func:`cudaEnablePeerAccess`."""

cudaErrorHostMemoryAlreadyRegistered = 712
"""This error indicates that the memory range passed to :func:`cudaHostRegister` has already been
registered."""

cudaErrorHostMemoryNotRegistered = 713
"""This error indicates that the pointer passed to :func:`cudaHostUnregister` does not correspond
to any currently registered memory region."""

cudaErrorHardwareStackError = 714
"""Device encountered an error in the call stack during kernel execution, possibly due to stack
corruption or exceeding the stack size limit. This leaves the process in an inconsistent state and
any further CUDA work will return the same error. To continue using CUDA, the process must be
terminated and relaunched."""

cudaErrorIllegalInstruction = 715
"""The device encountered an illegal instruction during kernel execution This leaves the process
in an inconsistent state and any further CUDA work will return the same error. To continue using
CUDA, the process must be terminated and relaunched."""

cudaErrorMisalignedAddress = 716
"""The device encountered a load or store instruction on a memory address which is not aligned.
This leaves the process in an inconsistent state and any further CUDA work will return the same
error. To continue using CUDA, the process must be terminated and relaunched."""

cudaErrorInvalidAddressSpace = 717
"""While executing a kernel, the device encountered an instruction which can only operate on
memory locations in certain address spaces (global, shared, or local), but was supplied a memory
address not belonging to an allowed address space. This leaves the process in an inconsistent
state and any further CUDA work will return the same error. To continue using CUDA, the process
must be terminated and relaunched."""

cudaErrorInvalidPc = 718
"""The device encountered an invalid program counter. This leaves the process in an inconsistent
state and any further CUDA work will return the same error. To continue using CUDA, the process
must be terminated and relaunched."""

cudaErrorLaunchFailure = 719
"""An exception occurred on the device while executing a kernel. Common causes include
dereferencing an invalid device pointer and accessing out of bounds shared memory. Less common
cases can be system specific - more information about these cases can be found in the system
specific user guide. This leaves the process in an inconsistent state and any further CUDA work
will return the same error. To continue using CUDA, the process must be terminated and
relaunched."""

cudaErrorCooperativeLaunchTooLarge = 720
"""This error indicates that the number of blocks launched per grid for a kernel that was launched
via either :func:`cudaLaunchCooperativeKernel` or :func:`cudaLaunchCooperativeKernelMultiDevice`
exceeds the maximum number of blocks as allowed by
:func:`cudaOccupancyMaxActiveBlocksPerMultiprocessor` or
:func:`cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` times the number of multiprocessors
as specified by the device attribute :func:`cudaDevAttrMultiProcessorCount`."""

cudaErrorNotPermitted = 800
"""This error indicates the attempted operation is not permitted."""

cudaErrorNotSupported = 801
"""This error indicates the attempted operation is not supported on the current system or
device."""

cudaErrorSystemNotReady = 802
"""This error indicates that the system is not yet ready to start any CUDA work. To continue using
CUDA, verify the system configuration is in a valid state and all required driver daemons are
actively running. More information about this error can be found in the system specific user
guide."""

cudaErrorSystemDriverMismatch = 803
"""This error indicates that there is a mismatch between the versions of the display driver and
the CUDA driver. Refer to the compatibility documentation for supported versions."""

cudaErrorCompatNotSupportedOnDevice = 804
"""This error indicates that the system was upgraded to run with forward compatibility but the
visible hardware detected by CUDA does not support this configuration. Refer to the compatibility
documentation for the supported hardware matrix or ensure that only supported hardware is visible
during initialization via the ``CUDA_VISIBLE_DEVICES`` environment variable."""

cudaErrorMpsConnectionFailed = 805
"""This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS
server."""

cudaErrorMpsRpcFailure = 806
"""This error indicates that the remote procedural call between the MPS server and the MPS client
failed."""

cudaErrorMpsServerNotReady = 807
"""This error indicates that the MPS server is not ready to accept new MPS client requests. This
error can be returned when the MPS server is in the process of recovering from a fatal failure."""

cudaErrorMpsMaxClientsReached = 808
"""This error indicates that the hardware resources required to create MPS client have been
exhausted."""

cudaErrorMpsMaxConnectionsReached = 809
"""This error indicates that the hardware resources required to support device connections have
been exhausted."""

cudaErrorMpsClientTerminated = 810
"""This error indicates that the MPS client has been terminated by the server. To continue using
CUDA, the process must be terminated and relaunched."""

cudaErrorCdpNotSupported = 811
"""This error indicates, that the program is using CUDA Dynamic Parallelism, but the current
configuration, like MPS, does not support it."""

cudaErrorCdpVersionMismatch = 812
"""This error indicates, that the program contains an unsupported interaction between different
versions of CUDA Dynamic Parallelism."""

cudaErrorStreamCaptureUnsupported = 900
"""The operation is not permitted when the stream is capturing."""

cudaErrorStreamCaptureInvalidated = 901
"""The current capture sequence on the stream has been invalidated due to a previous error."""

cudaErrorStreamCaptureMerge = 902
"""The operation would have resulted in a merge of two independent capture sequences."""

cudaErrorStreamCaptureUnmatched = 903
"""The capture was not initiated in this stream."""

cudaErrorStreamCaptureUnjoined = 904
"""The capture sequence contains a fork that was not joined to the primary stream."""

cudaErrorStreamCaptureIsolation = 905
"""A dependency would have been created which crosses the capture sequence boundary. Only implicit
in-stream ordering dependencies are allowed to cross the boundary."""

cudaErrorStreamCaptureImplicit = 906
"""The operation would have resulted in a disallowed implicit dependency on a current capture
sequence from :data:`cudaStreamLegacy`."""

cudaErrorCapturedEvent = 907
"""The operation is not permitted on an event which was last recorded in a capturing stream."""

cudaErrorStreamCaptureWrongThread = 908
"""A stream capture sequence not initiated with the :data:`cudaStreamCaptureModeRelaxed` argument
to :func:`cudaStreamBeginCapture` was passed to :func:`cudaStreamEndCapture` in a different
thread."""

cudaErrorTimeout = 909
"""This indicates that the wait operation has timed out."""

cudaErrorGraphExecUpdateFailure = 910
"""This error indicates that the graph update was not performed because it included changes which
violated constraints specific to instantiated graph update."""

cudaErrorExternalDevice = 911
"""This indicates that an async error has occurred in a device outside of CUDA. If CUDA was
waiting for an external device's signal before consuming shared data, the external device signaled
an error indicating that the data is not valid for consumption. This leaves the process in an
inconsistent state and any further CUDA work will return the same error.
To continue using CUDA, the process must be terminated and relaunched."""

cudaErrorInvalidClusterSize = 912
"""This indicates that a kernel launch error has occurred due to cluster misconfiguration."""

cudaErrorUnknown = 999
"""This indicates that an unknown internal error has occurred."""

# pylint: enable=line-too-long


# Error Checking #
class cudaError(Exception):
    """Base exception class for CUDA driver query errors."""

    # Maps raw error codes to their dedicated ``cudaError_*`` subclass; populated by
    # :func:`_extract_cuda_errors_as_classes` at import time.
    _value_class_mapping: _ClassVar[dict[int, type[cudaError]]] = {}
    # Human-readable descriptions, seeded with known codes and lazily extended from the
    # CUDA Runtime library in :meth:`__repr__`.
    _errcode_to_string: _ClassVar[dict[int, str]] = {  # List of currently known error codes
        cudaErrorInitializationError:        'Initialization error.',
        cudaErrorSymbolNotFound:             'Named symbol not found.',
        cudaErrorInvalidValue:               'Invalid argument.',
        cudaErrorNoDevice:                   'No CUDA-capable device is detected.',
        cudaErrorInvalidDevice:              'Invalid device ordinal.',
        cudaErrorSystemDriverMismatch:       'System has unsupported display driver / CUDA driver combination.',
        cudaErrorCudartUnloading:            'Driver shutting down.',
        cudaErrorCompatNotSupportedOnDevice: 'Forward compatibility was attempted on non supported Hardware.',
        cudaErrorDeviceUninitialized:        'Invalid device context.',
    }  # fmt:skip
    # Maps raw error codes to the ``cudaError*`` constant names; lazily extended in __repr__.
    _errcode_to_name: _ClassVar[dict[int, str]] = {}
    # The raw integer CUDA error code carried by this exception instance.
    value: int

    def __new__(cls, value: int) -> _Self:
        """Map value to a proper subclass of :class:`cudaError`."""
        if cls is cudaError:
            # Dispatch to the dedicated subclass for this error code when one exists;
            # unknown codes fall back to the base class.
            # pylint: disable-next=self-cls-assignment
            cls = cudaError._value_class_mapping.get(value, cls)  # type: ignore[assignment]
        obj = Exception.__new__(cls)
        obj.value = value
        return obj

    def __repr__(self) -> str:
        """Return a string representation of the error."""
        # pylint: disable=no-member
        try:
            # Cache the description and name queried from the CUDA Runtime library so
            # subsequent representations do not hit the library again.
            if self.value not in cudaError._errcode_to_string:
                cudaError._errcode_to_string[self.value] = '{}.'.format(
                    cuGetErrorString(self.value).rstrip('.').capitalize(),
                )
            if self.value not in cudaError._errcode_to_name:
                cudaError._errcode_to_name[self.value] = cudaGetErrorName(self.value)
            return (
                f'{cudaError._errcode_to_string[self.value]} '
                f'Code: {cudaError._errcode_to_name[self.value]} ({self.value}).'
            )
        except cudaError:
            # The library itself may be unavailable (e.g. cudaError_InitializationError);
            # fall back to a bare numeric representation.
            return f'CUDA Error with code {self.value}.'

    def __eq__(self, other: object) -> bool:
        """Test equality to other object."""
        if not isinstance(other, cudaError):
            return NotImplemented
        return self.value == other.value  # pylint: disable=no-member

    def __reduce__(self) -> tuple[type[cudaError], tuple[int]]:
        """Return state information for pickling."""
        return cudaError, (self.value,)  # pylint: disable=no-member


def cudaExceptionClass(cudaErrorCode: int) -> type[cudaError]:
    """Map value to a proper subclass of :class:`cudaError`.

    Raises:
        ValueError: If the error code is not valid.
    """
    if cudaErrorCode not in cudaError._value_class_mapping:  # pylint: disable=protected-access
        raise ValueError(f'cudaErrorCode {cudaErrorCode} is not valid.')
    return cudaError._value_class_mapping[cudaErrorCode]  # pylint: disable=protected-access


def _extract_cuda_errors_as_classes() -> None:
    """Generate a hierarchy of classes on top of :class:`cudaError` class.

    Each CUDA Error gets a new :class:`cudaError` subclass. This way try-except blocks can filter
    appropriate exceptions more easily.

    :class:`cudaError` is a parent class. Each ``cudaError*`` gets its own subclass.
    e.g. :data:`cudaErrorInvalidValue` will be turned into :class:`cudaError_InvalidValue`.
    """
    this_module = _sys.modules[__name__]
    # Collect all ``cudaError*`` constant names, excluding the base class itself and the
    # generated ``cudaError_*`` subclasses.
    cuda_error_names = [
        x
        for x in dir(this_module)
        if x.startswith('cudaError') and not x.startswith('cudaError_') and x != 'cudaError'
    ]
    for err_name in cuda_error_names:
        # e.g. Turn cudaErrorInvalidValue into cudaError_InvalidValue
        class_name = err_name.replace('cudaError', 'cudaError_')
        err_val = getattr(this_module, err_name)

        def gen_new(value: int) -> _Callable[[type[cudaError]], cudaError]:
            # Factory binding ``value`` eagerly to avoid the late-binding closure pitfall
            # inside the enclosing loop.
            def new(cls: type[cudaError]) -> cudaError:
                return cudaError.__new__(cls, value)

            return new

        # pylint: disable=protected-access
        new_error_class = type(class_name, (cudaError,), {'__new__': gen_new(err_val)})
        new_error_class.__module__ = __name__
        if err_val in cudaError._errcode_to_string:
            new_error_class.__doc__ = (
                f'cudaError: {cudaError._errcode_to_string[err_val]} '
                f'Code: :data:`{err_name}` ({err_val}).'
            )
        else:
            new_error_class.__doc__ = f'CUDA Error with code :data:`{err_name}` ({err_val})'
        # Publish the subclass at module level and register it for code-based dispatch.
        setattr(this_module, class_name, new_error_class)
        cudaError._value_class_mapping[err_val] = new_error_class
        cudaError._errcode_to_name[err_val] = err_name


# Add explicit references to appease linters
class __cudaError(cudaError):
    value: int

    def __new__(cls) -> cudaError:  # type: ignore[misc,empty-body]
        ...
# Forward declarations of the dynamically generated subclasses for static analyzers.
cudaError_InitializationError: type[__cudaError]
cudaError_SymbolNotFound: type[__cudaError]
cudaError_InvalidValue: type[__cudaError]
cudaError_NoDevice: type[__cudaError]
cudaError_InvalidDevice: type[__cudaError]
cudaError_SystemDriverMismatch: type[__cudaError]
cudaError_CudartUnloading: type[__cudaError]
cudaError_CompatNotSupportedOnDevice: type[__cudaError]
cudaError_DeviceUninitialized: type[__cudaError]

_extract_cuda_errors_as_classes()
del _extract_cuda_errors_as_classes


def _cudaCheckReturn(ret: _Any) -> _Any:
    # Raise the appropriate cudaError subclass for any non-success return code.
    if ret != cudaSuccess:
        raise cudaError(ret)
    return ret


# Function access #
__cudaLib: _ctypes.CDLL | None = None
__libLoadLock: _threading.Lock = _threading.Lock()
# Function pointers are cached to prevent unnecessary libLoadLock locking
__cudaGetFunctionPointer_cache: dict[str, _ctypes._CFuncPtr] = {}  # type: ignore[name-defined]


def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:  # type: ignore[name-defined]
    """Get the function pointer from the CUDA Runtime library.

    Raises:
        cudaError_InitializationError:
            If cannot found the CUDA Runtime library.
        cudaError_SymbolNotFound:
            If cannot found the function pointer.
    """
    if name in __cudaGetFunctionPointer_cache:
        return __cudaGetFunctionPointer_cache[name]

    if __cudaLib is None:
        __LoadCudaLibrary()

    with __libLoadLock:
        try:
            __cudaGetFunctionPointer_cache[name] = getattr(__cudaLib, name)
            return __cudaGetFunctionPointer_cache[name]
        except AttributeError as ex:
            raise cudaError(cudaErrorSymbolNotFound) from ex


def __LoadCudaLibrary() -> None:  # pylint: disable=too-many-branches
    """Load the library if it isn't loaded already.

    Raises:
        cudaError_InitializationError:
            If cannot found the CUDA Runtime library.
    """
    global __cudaLib  # pylint: disable=global-statement

    if __cudaLib is None:
        # Lock to ensure only one caller loads the library
        with __libLoadLock:
            # Ensure the library still isn't loaded
            if __cudaLib is None:  # pylint: disable=too-many-nested-blocks
                # Platform specific libcudart location
                system = _platform.system()
                bits = 8 * _ctypes.sizeof(_ctypes.c_void_p)  # 64 or 32
                if system == 'Darwin':
                    lib_filenames = ['libcudart.dylib']
                elif system == 'Linux':
                    lib_filenames = ['libcudart.so']
                elif system == 'Windows':
                    lib_filenames = [f'cudart{bits}.dll', 'cudart.dll']
                else:
                    lib_filenames = []
                # Open library
                for lib_filename in lib_filenames:
                    try:
                        __cudaLib = _ctypes.CDLL(lib_filename)
                        break
                    except OSError:
                        pass
                # Try to load the library from the CUDA_PATH environment variable
                if __cudaLib is None:
                    cuda_paths = [
                        _os.getenv(env_name, '')
                        for env_name in ('CUDA_PATH', 'CUDA_HOME', 'CUDA_ROOT')
                    ]
                    if system != 'Windows':
                        cuda_paths.append('/usr/local/cuda')
                        candidate_paths = []
                        for cuda_path in cuda_paths:
                            if _os.path.isdir(cuda_path):
                                for lib_filename in lib_filenames:
                                    candidate_paths.extend(
                                        [
                                            _os.path.join(cuda_path, f'lib{bits}', lib_filename),
                                            _os.path.join(cuda_path, 'lib', lib_filename),
                                        ],
                                    )
                    else:
                        # On Windows scan PATH and the CUDA toolkit directories for
                        # versioned ``cudart{bits}*.dll`` files.
                        candidate_dirs = _os.getenv('PATH', '').split(_os.path.pathsep)
                        candidate_paths = []
                        for cuda_path in cuda_paths:
                            if _os.path.isdir(cuda_path):
                                candidate_dirs.extend(
                                    [
                                        _os.path.join(cuda_path, 'bin'),
                                        _os.path.join(cuda_path, f'lib{bits}'),
                                        _os.path.join(cuda_path, 'lib'),
                                    ],
                                )
                        for candidate_dir in candidate_dirs:
                            candidate_paths.extend(
                                _glob.iglob(_os.path.join(candidate_dir, f'cudart{bits}*.dll')),
                            )
                    # Normalize paths and remove duplicates
                    candidate_paths = list(
                        dict.fromkeys(
                            _os.path.normpath(_os.path.normcase(p)) for p in candidate_paths
                        ),
                    )
                    for lib_filename in candidate_paths:
                        try:
                            __cudaLib = _ctypes.CDLL(lib_filename)
                            break
                        except OSError:
                            pass
                if __cudaLib is None:
                    _cudaCheckReturn(cudaErrorInitializationError)


def cudaGetErrorName(error: int) -> str:
    """Get the string representation of an error code enum name.

    Returns: str
        A string containing the name of an error code in the enum. If the error code is not
        recognized, "unrecognized error code" is returned.

    Raises:
        cudaError_InitializationError:
            If cannot found the CUDA Runtime library.
    """
    fn = __cudaGetFunctionPointer('cudaGetErrorName')
    fn.restype = _ctypes.c_char_p  # otherwise return is an int
    p_name = fn(_cudaError_t(error))
    name = _ctypes.string_at(p_name)
    return name.decode('utf-8', errors='replace')


# NOTE(review): named in driver-API style (`cu*`) but binds the runtime symbol
# `cudaGetErrorString`; renaming would break callers such as `cudaError.__repr__` — confirm
# whether the driver-style name is intentional.
def cuGetErrorString(error: int) -> str:
    """Get the description string for an error code.

    Returns: str
        The description string for an error code. If the error code is not recognized,
        "unrecognized error code" is returned.

    Raises:
        cudaError_InitializationError:
            If cannot found the CUDA Runtime library.
    """
    fn = __cudaGetFunctionPointer('cudaGetErrorString')
    fn.restype = _ctypes.c_char_p  # otherwise return is an int
    p_name = fn(_cudaError_t(error))
    name = _ctypes.string_at(p_name)
    return name.decode('utf-8', errors='replace')


def cudaGetLastError() -> int:
    """Get the last error from a runtime call.

    Returns: int
        The last error that has been produced by any of the runtime calls in the same instance of
        the CUDA Runtime library in the host thread and resets it to :data:`cudaSuccess`.

    Raises:
        cudaError_InitializationError:
            If cannot found the CUDA Runtime library.
        cudaError_InsufficientDriver:
            If the installed NVIDIA CUDA driver is older than the CUDA Runtime library.
        cudaError_NoDevice:
            If no CUDA-capable devices were detected by the installed CUDA driver.
    """
    fn = __cudaGetFunctionPointer('cudaGetLastError')
    return fn()


def cudaPeekAtLastError() -> int:
    """Get the last error from a runtime call.

    Returns: int
        The last error that has been produced by any of the runtime calls in the same instance of
        the CUDA Runtime library in the host thread. This call does not reset the error to
        :data:`cudaSuccess` like :func:`cudaGetLastError`.

    Raises:
        cudaError_InitializationError:
            If cannot found the CUDA Runtime library.
        cudaError_InsufficientDriver:
            If the installed NVIDIA CUDA driver is older than the CUDA Runtime library.
        cudaError_NoDevice:
            If no CUDA-capable devices were detected by the installed CUDA driver.
    """
    fn = __cudaGetFunctionPointer('cudaPeekAtLastError')
    return fn()


def cudaDriverGetVersion() -> str:
    """Get the latest CUDA version supported by driver.

    Returns: str
        The latest version of CUDA supported by the driver of the form :data:`'<major>.<minor>'`.

    Raises:
        cudaError_InitializationError:
            If cannot found the CUDA Runtime library.
        cudaError_InsufficientDriver:
            If the installed NVIDIA CUDA driver is older than the CUDA Runtime library.
        cudaError_NoDevice:
            If no CUDA-capable devices were detected by the installed CUDA driver.
    """
    fn = __cudaGetFunctionPointer('cudaDriverGetVersion')

    driver_version = _ctypes.c_int()
    ret = fn(_ctypes.byref(driver_version))
    _cudaCheckReturn(ret)
    # The raw version is encoded as 1000 * major + 10 * minor.
    major = driver_version.value // 1000
    minor = (driver_version.value % 1000) // 10
    return f'{major}.{minor}'


def cudaRuntimeGetVersion() -> str:
    """Get the CUDA Runtime version.

    Returns: str
        The version number of the current CUDA Runtime instance of the form
        :data:`'<major>.<minor>'`.

    Raises:
        cudaError_InitializationError:
            If cannot found the CUDA Runtime library.
        cudaError_InsufficientDriver:
            If the installed NVIDIA CUDA driver is older than the CUDA Runtime library.
        cudaError_NoDevice:
            If no CUDA-capable devices were detected by the installed CUDA driver.
    """
    fn = __cudaGetFunctionPointer('cudaRuntimeGetVersion')

    runtime_version = _ctypes.c_int()
    ret = fn(_ctypes.byref(runtime_version))
    _cudaCheckReturn(ret)
    # The raw version is encoded as 1000 * major + 10 * minor.
    major = runtime_version.value // 1000
    minor = (runtime_version.value % 1000) // 10
    return f'{major}.{minor}'


def cudaGetDeviceCount() -> int:
    """Get the number of compute-capable devices.

    Returns: int
        The number of devices with compute capability greater or equal to 2.0 that are available
        for execution.

    Raises:
        cudaError_InitializationError:
            If cannot found the CUDA Runtime library.
        cudaError_InsufficientDriver:
            If the installed NVIDIA CUDA driver is older than the CUDA Runtime library.
        cudaError_NoDevice:
            If no CUDA-capable devices were detected by the installed CUDA driver.
    """
    fn = __cudaGetFunctionPointer('cudaGetDeviceCount')

    count = _ctypes.c_int(0)
    ret = fn(_ctypes.byref(count))
    _cudaCheckReturn(ret)
    return count.value


def cudaDeviceGetByPCIBusId(pciBusId: str) -> int:
    """Get a handle to a compute device.

    Args:
        pciBusId (str):
            String in one of the following forms: ``[domain]:[bus]:[device].[function]``,
            ``[domain]:[bus]:[device]``, ``[bus]:[device].[function]`` where ``domain``, ``bus``,
            ``device``, and ``function`` are all hexadecimal values.

    Returns: int
        A device ordinal given a PCI bus ID string.

    Raises:
        cudaError_InitializationError:
            If cannot found the CUDA Runtime library.
        cudaError_InsufficientDriver:
            If the installed NVIDIA CUDA driver is older than the CUDA Runtime library.
        cudaError_NoDevice:
            If no CUDA-capable devices were detected by the installed CUDA driver.
        cudaError_InvalidValue:
            If the value of :data:`pciBusId` is not a valid PCI bus identifier.
        cudaError_InvalidDevice:
            If the device ordinal supplied by the user does not correspond to a valid CUDA device.
    """
    fn = __cudaGetFunctionPointer('cudaDeviceGetByPCIBusId')

    device = _ctypes.c_int()
    ret = fn(_ctypes.byref(device), _ctypes.c_char_p(pciBusId.encode('utf-8')))
    _cudaCheckReturn(ret)
    return device.value


def cudaDeviceGetPCIBusId(device: int) -> str:
    """Get a PCI Bus Id string for the device.

    Returns: str
        An ASCII string identifying the device.

    Raises:
        cudaError_InitializationError:
            If cannot found the CUDA Runtime library.
        cudaError_InsufficientDriver:
            If the installed NVIDIA CUDA driver is older than the CUDA Runtime library.
        cudaError_NoDevice:
            If no CUDA-capable devices were detected by the installed CUDA driver.
        cudaError_InvalidValue:
            If the value of :data:`device` is not a valid device ordinal.
        cudaError_InvalidDevice:
            If the device ordinal supplied by the user does not correspond to a valid CUDA device.
    """
    fn = __cudaGetFunctionPointer('cudaDeviceGetPCIBusId')

    # 256 bytes is more than enough for the ``[domain]:[bus]:[device].[function]`` form.
    pciBusId = _ctypes.create_string_buffer(256)
    ret = fn(pciBusId, _ctypes.c_int(256), _ctypes.c_int(device))
    _cudaCheckReturn(ret)
    return pciBusId.value.decode('utf-8', errors='replace')


def is_available() -> bool:
    """Test whether there are any CUDA visible devices."""
    try:
        return cudaGetDeviceCount() > 0
    except cudaError:
        return False
nvitop-1.4.2/nvitop/api/libnvml.py000066400000000000000000001150441474547113600171630ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2025 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================== """Utilities for the NVML Python bindings (`nvidia-ml-py `_).""" # pylint: disable=invalid-name from __future__ import annotations import atexit as _atexit import ctypes as _ctypes import inspect as _inspect import logging as _logging import os as _os import re as _re import sys as _sys import threading as _threading import time as _time from types import FunctionType as _FunctionType from types import ModuleType as _ModuleType from typing import TYPE_CHECKING as _TYPE_CHECKING from typing import Any as _Any from typing import ClassVar as _ClassVar # Python Bindings for the NVIDIA Management Library (NVML) # https://pypi.org/project/nvidia-ml-py import pynvml as _pynvml from pynvml import * # noqa: F403 # pylint: disable=wildcard-import,unused-wildcard-import from pynvml import nvmlDeviceGetPciInfo # appease mypy # noqa: F401 # pylint: disable=unused-import from nvitop.api.utils import NA, UINT_MAX, ULONGLONG_MAX, NaType from nvitop.api.utils import colored as __colored if _TYPE_CHECKING: from collections.abc import Callable as _Callable from typing_extensions import Self as _Self # Python 3.11+ from typing_extensions import TypeAlias as _TypeAlias # Python 3.10+ __all__ = [ # will be updated in below 'NA', 'UINT_MAX', 'ULONGLONG_MAX', 'nvmlCheckReturn', 'nvmlQuery', 'nvmlQueryFieldValues', 'nvmlInit', 'nvmlInitWithFlags', 'nvmlShutdown', 'NVMLError', ] if not callable(getattr(_pynvml, 'nvmlInitWithFlags', None)): raise ImportError( # noqa: TRY004 'Your installed package `nvidia-ml-py` is corrupted. 
Please reinstall package ' '`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.', ) # Members from `pynvml` ############################################################################ NVMLError: type[_pynvml.NVMLError] = _pynvml.NVMLError NVMLError.__doc__ = """Base exception class for NVML query errors.""" NVMLError.__new__.__doc__ = """Map value to a proper subclass of :class:`NVMLError`.""" nvmlExceptionClass: _Callable[[int], type[_pynvml.NVMLError]] = _pynvml.nvmlExceptionClass nvmlExceptionClass.__doc__ = """Map value to a proper subclass of :class:`NVMLError`.""" # Load members from module `pynvml` and register them in `__all__` and globals. _vars_pynvml = vars(_pynvml) _name = _attr = None _errcode_to_name = {} _const_names = [] _errcode_to_string = NVMLError._errcode_to_string # pylint: disable=protected-access # 1. Put error classes in `__all__` first for _name, _attr in _vars_pynvml.items(): if _name in {'nvmlInit', 'nvmlInitWithFlags', 'nvmlShutdown'}: continue if _name.startswith(('NVML_ERROR_', 'NVMLError_')): __all__.append(_name) # noqa: PYI056 if _name.startswith('NVML_ERROR_'): _errcode_to_name[_attr] = _name _const_names.append(_name) # 2. Then the remaining members for _name, _attr in _vars_pynvml.items(): if _name in {'nvmlInit', 'nvmlInitWithFlags', 'nvmlShutdown'}: continue if (_name.startswith('NVML_') and not _name.startswith('NVML_ERROR_')) or ( _name.startswith('nvml') and isinstance(_attr, _FunctionType) ): __all__.append(_name) # noqa: PYI056 if _name.startswith('NVML_'): _const_names.append(_name) # 3. Add docstring to exception classes _errcode = _reason = _subclass = None for _errcode, _reason in _errcode_to_string.items(): _subclass = nvmlExceptionClass(_errcode) _subclass.__doc__ = '{}. Code: :data:`{}` ({})'.format( _reason.rstrip('.'), _errcode_to_name[_errcode], _errcode, ) # 4. 
Add undocumented constants into module docstring _data_docs = [] _sphinx_doc = None for _name in _const_names: _attr = _vars_pynvml[_name] _sphinx_doc = f""" .. data:: {_name} :type: {_attr.__class__.__name__} :value: {_attr!r} """ if _name.startswith('NVML_ERROR_') and _attr in _errcode_to_string: _reason = _errcode_to_string[_attr] _sphinx_doc += """ {}. See also class :class:`NVMLError` and :class:`{}`. """.format(_reason.rstrip('.'), nvmlExceptionClass(_attr).__name__) # fmt: skip _data_docs.append(_sphinx_doc.strip()) __doc__ += """ --------- Constants ^^^^^^^^^ {} --------- Functions and Exceptions ^^^^^^^^^^^^^^^^^^^^^^^^ .. function:: __enter__() -> libnvml Entry of the context manager for ``with`` statement. .. function:: __exit__(*args, **kwargs) -> None Shutdown the NVML context in the context manager for ``with`` statement. """.format('\n\n'.join(_data_docs)) # fmt: skip del ( _name, _attr, _vars_pynvml, _errcode, _reason, _subclass, _errcode_to_name, _errcode_to_string, _const_names, _data_docs, _sphinx_doc, ) # 5. 
Add explicit references to appease linters # pylint: disable=no-member c_nvmlDevice_t: _TypeAlias = _pynvml.c_nvmlDevice_t # noqa: PYI042 c_nvmlFieldValue_t: _TypeAlias = _pynvml.c_nvmlFieldValue_t # noqa: PYI042 NVML_SUCCESS: int = _pynvml.NVML_SUCCESS NVML_ERROR_INSUFFICIENT_SIZE: int = _pynvml.NVML_ERROR_INSUFFICIENT_SIZE NVMLError_FunctionNotFound: _TypeAlias = _pynvml.NVMLError_FunctionNotFound NVMLError_GpuIsLost: _TypeAlias = _pynvml.NVMLError_GpuIsLost NVMLError_InvalidArgument: _TypeAlias = _pynvml.NVMLError_InvalidArgument NVMLError_LibraryNotFound: _TypeAlias = _pynvml.NVMLError_LibraryNotFound NVMLError_NoPermission: _TypeAlias = _pynvml.NVMLError_NoPermission NVMLError_NotFound: _TypeAlias = _pynvml.NVMLError_NotFound NVMLError_NotSupported: _TypeAlias = _pynvml.NVMLError_NotSupported NVMLError_Unknown: _TypeAlias = _pynvml.NVMLError_Unknown NVML_CLOCK_GRAPHICS: int = _pynvml.NVML_CLOCK_GRAPHICS NVML_CLOCK_SM: int = _pynvml.NVML_CLOCK_SM NVML_CLOCK_MEM: int = _pynvml.NVML_CLOCK_MEM NVML_CLOCK_VIDEO: int = _pynvml.NVML_CLOCK_VIDEO NVML_TEMPERATURE_GPU: int = _pynvml.NVML_TEMPERATURE_GPU NVML_DRIVER_WDDM: int = _pynvml.NVML_DRIVER_WDDM NVML_DRIVER_WDM: int = _pynvml.NVML_DRIVER_WDM NVML_MEMORY_ERROR_TYPE_UNCORRECTED: int = _pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED NVML_VOLATILE_ECC: int = _pynvml.NVML_VOLATILE_ECC NVML_COMPUTEMODE_DEFAULT: int = _pynvml.NVML_COMPUTEMODE_DEFAULT NVML_COMPUTEMODE_EXCLUSIVE_THREAD: int = _pynvml.NVML_COMPUTEMODE_EXCLUSIVE_THREAD NVML_COMPUTEMODE_PROHIBITED: int = _pynvml.NVML_COMPUTEMODE_PROHIBITED NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: int = _pynvml.NVML_COMPUTEMODE_EXCLUSIVE_PROCESS NVML_PCIE_UTIL_TX_BYTES: int = _pynvml.NVML_PCIE_UTIL_TX_BYTES NVML_PCIE_UTIL_RX_BYTES: int = _pynvml.NVML_PCIE_UTIL_RX_BYTES NVML_NVLINK_MAX_LINKS: int = _pynvml.NVML_NVLINK_MAX_LINKS NVML_FI_DEV_NVLINK_LINK_COUNT: int = _pynvml.NVML_FI_DEV_NVLINK_LINK_COUNT NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX: int = 
_pynvml.NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX: int = _pynvml.NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX: int = _pynvml.NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX: int = _pynvml.NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX NVML_VALUE_TYPE_DOUBLE: int = getattr(_pynvml, 'NVML_VALUE_TYPE_DOUBLE', 0) NVML_VALUE_TYPE_UNSIGNED_INT: int = getattr(_pynvml, 'NVML_VALUE_TYPE_UNSIGNED_INT', 1) NVML_VALUE_TYPE_UNSIGNED_LONG: int = getattr(_pynvml, 'NVML_VALUE_TYPE_UNSIGNED_LONG', 2) NVML_VALUE_TYPE_UNSIGNED_LONG_LONG: int = getattr(_pynvml, 'NVML_VALUE_TYPE_UNSIGNED_LONG_LONG', 3) NVML_VALUE_TYPE_SIGNED_LONG_LONG: int = getattr(_pynvml, 'NVML_VALUE_TYPE_SIGNED_LONG_LONG', 4) NVML_VALUE_TYPE_SIGNED_INT: int = getattr(_pynvml, 'NVML_VALUE_TYPE_SIGNED_INT', 5) # pylint: enable=no-member # New members in `libnvml` ######################################################################### __flags: list[int] = [] __initialized: bool = False __lock: _threading.Lock = _threading.Lock() LOGGER: _logging.Logger = _logging.getLogger(__name__) try: LOGGER.setLevel(_os.getenv('LOGLEVEL', default='WARNING').upper()) except (ValueError, TypeError): pass if not LOGGER.hasHandlers() and LOGGER.isEnabledFor(_logging.DEBUG): _formatter = _logging.Formatter( '[%(levelname)s] %(asctime)s %(name)s::%(funcName)s: %(message)s', ) _stream_handler = _logging.StreamHandler() _stream_handler.setFormatter(_formatter) _file_handler = _logging.FileHandler('nvitop.log') _file_handler.setFormatter(_formatter) LOGGER.addHandler(_stream_handler) LOGGER.addHandler(_file_handler) del _formatter, _stream_handler, _file_handler UNKNOWN_FUNCTIONS: dict[str, tuple[_Callable | str, NVMLError_FunctionNotFound]] = {} UNKNOWN_FUNCTIONS_CACHE_SIZE: int = 1024 VERSIONED_PATTERN: _re.Pattern = _re.compile(r'^(?P\w+)(?P_v(\d)+)$') def _lazy_init() -> None: """Lazily initialize the NVML context. 
def nvmlInit() -> None:  # pylint: disable=function-redefined
    """Initialize the NVML context using the default flag (0).

    Equivalent to calling :func:`nvmlInitWithFlags` with ``flags=0``.

    Raises:
        NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for
            NVIDIA driver without reloading the kernel module.
        AttributeError:
            If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the
            :mod:`pynvml` module is overridden by other modules. Need to reinstall package
            ``nvidia-ml-py``.
    """
    nvmlInitWithFlags(0)
""" global __flags, __initialized # pylint: disable=global-statement,global-variable-not-assigned with __lock: if len(__flags) > 0 and flags == __flags[-1]: __initialized = True return try: _pynvml.nvmlInitWithFlags(flags) except NVMLError_LibraryNotFound: message = ( 'FATAL ERROR: NVIDIA Management Library (NVML) not found.\n' 'HINT: The NVIDIA Management Library ships with the NVIDIA display driver (available at\n' ' https://www.nvidia.com/Download/index.aspx), or can be downloaded as part of the\n' ' NVIDIA CUDA Toolkit (available at https://developer.nvidia.com/cuda-downloads).\n' ' The lists of OS platforms and NVIDIA-GPUs supported by the NVML library can be\n' ' found in the NVML API Reference at https://docs.nvidia.com/deploy/nvml-api.' ) for text, color, attrs in ( ('FATAL ERROR:', 'red', ('bold',)), ('HINT:', 'yellow', ('bold',)), ('https://www.nvidia.com/Download/index.aspx', None, ('underline',)), ('https://developer.nvidia.com/cuda-downloads', None, ('underline',)), ('https://docs.nvidia.com/deploy/nvml-api', None, ('underline',)), ): message = message.replace(text, __colored(text, color=color, attrs=attrs)) # type: ignore[arg-type] LOGGER.critical(message) raise except AttributeError: message = ( 'FATAL ERROR: The dependency package `nvidia-ml-py` is corrupted. You may have installed\n' ' other packages overriding the module `pynvml`.\n' 'Please reinstall `nvitop` with command:\n' ' python3 -m pip install --force-reinstall nvitop' ) for text, color, attrs in ( ('FATAL ERROR:', 'red', ('bold',)), ('nvidia-ml-py', None, ('bold',)), ('pynvml', None, ('bold',)), ('nvitop', None, ('bold',)), ): message = message.replace(text, __colored(text, color=color, attrs=attrs), 1) # type: ignore[arg-type] LOGGER.critical(message) raise with __lock: __flags.append(flags) __initialized = True def nvmlShutdown() -> None: # pylint: disable=function-redefined """Shutdown the NVML context. 
Raises: NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. NVMLError_Uninitialized: If NVML was not first initialized with :func:`nvmlInit`. """ global __flags, __initialized # pylint: disable=global-statement,global-variable-not-assigned _pynvml.nvmlShutdown() with __lock: try: __flags.pop() except IndexError: pass __initialized = len(__flags) > 0 def nvmlQuery( func: _Callable[..., _Any] | str, *args: _Any, default: _Any = NA, ignore_errors: bool = True, ignore_function_not_found: bool = False, **kwargs: _Any, ) -> _Any: """Call a function with the given arguments from NVML. The NVML context will be automatically initialized. Args: func (Union[Callable[..., Any], str]): The function to call. If it is given by string, lookup for the function first from module :mod:`pynvml`. default (Any): The default value if the query fails. ignore_errors (bool): Whether to ignore errors and return the default value. ignore_function_not_found (bool): Whether to ignore function not found errors and return the default value. If set to :data:`False`, an error message will be logged to the logger. *args: Positional arguments to pass to the query function. **kwargs: Keyword arguments to pass to the query function. Raises: NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. NVMLError_FunctionNotFound: If the function is not found, usually the installed ``nvidia-ml-py`` is not compatible with the installed NVIDIA driver. 
NVMLError_NotSupported: If the function is not supported by the driver or the device. NVMLError_InvalidArgument: If passed with an invalid argument. """ global UNKNOWN_FUNCTIONS # pylint: disable=global-statement,global-variable-not-assigned _lazy_init() try: if isinstance(func, str): try: func = getattr(__modself, func) except AttributeError as e1: raise NVMLError_FunctionNotFound from e1 try: retval = func(*args, **kwargs) # type: ignore[operator] except UnicodeDecodeError as e2: raise NVMLError_Unknown from e2 except NVMLError_FunctionNotFound as e3: if not ignore_function_not_found: identifier = ( func if isinstance(func, str) else (_inspect.getsource(func) if func.__name__ == '' else repr(func)) ) with __lock: if ( identifier not in UNKNOWN_FUNCTIONS and len(UNKNOWN_FUNCTIONS) < UNKNOWN_FUNCTIONS_CACHE_SIZE ): UNKNOWN_FUNCTIONS[identifier] = (func, e3) LOGGER.exception( ( 'ERROR: A FunctionNotFound error occurred while calling %s.\n' 'Please verify whether the `nvidia-ml-py` package is ' 'compatible with your NVIDIA driver version.' ), f'nvmlQuery({func!r}, *args, **kwargs)', ) if ignore_errors or ignore_function_not_found: return default raise except NVMLError: if ignore_errors: return default raise if isinstance(retval, bytes): retval = retval.decode('utf-8', errors='replace') return retval def nvmlQueryFieldValues( handle: c_nvmlDevice_t, field_ids: list[int | tuple[int, int]], ) -> list[tuple[float | int | NaType, int]]: """Query multiple field values from NVML. Request values for a list of fields for a device. This API allows multiple fields to be queried at once. If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs will be populated from a single call rather than making a driver call for each fieldId. Raises: NVMLError_InvalidArgument: If device or field_ids is invalid. 
""" field_values = nvmlQuery('nvmlDeviceGetFieldValues', handle, field_ids) if not nvmlCheckReturn(field_values): timestamp = _time.time_ns() // 1000 return [(NA, timestamp) for _ in range(len(field_ids))] values_with_timestamps: list[tuple[float | int | NaType, int]] = [] for field_value in field_values: timestamp = field_value.timestamp if field_value.nvmlReturn != NVML_SUCCESS: value = NA timestamp = _time.time_ns() // 1000 elif field_value.valueType == NVML_VALUE_TYPE_DOUBLE: value = field_value.value.dVal elif field_value.valueType == NVML_VALUE_TYPE_UNSIGNED_INT: value = field_value.value.uiVal elif field_value.valueType == NVML_VALUE_TYPE_UNSIGNED_LONG: value = field_value.value.ulVal elif field_value.valueType == NVML_VALUE_TYPE_UNSIGNED_LONG_LONG: value = field_value.value.ullVal elif field_value.valueType == NVML_VALUE_TYPE_SIGNED_LONG_LONG: value = field_value.value.llVal elif field_value.valueType == NVML_VALUE_TYPE_SIGNED_INT: value = field_value.value.iVal else: value = NA values_with_timestamps.append((value, timestamp)) return values_with_timestamps def nvmlCheckReturn(retval: _Any, types: type | tuple[type, ...] | None = None) -> bool: """Check whether the return value is not :const:`nvitop.NA` and is one of the given types.""" if types is None: return retval != NA return retval != NA and isinstance(retval, types) # Patch layers for backward compatibility ########################################################## _pynvml_installation_corrupted: bool = not callable( getattr(_pynvml, '_nvmlGetFunctionPointer', None), ) # Patch function `nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses` if not _pynvml_installation_corrupted: # pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined class c_nvmlProcessInfo_v1_t(_pynvml._PrintableStructure): # pylint: disable=protected-access _fields_: _ClassVar[list[tuple[str, type]]] = [ # Process ID ('pid', _ctypes.c_uint), # Amount of used GPU memory in bytes. 
# Under WDDM, NVML_VALUE_NOT_AVAILABLE is always reported because Windows KMD manages # all the memory and not the NVIDIA driver. ('usedGpuMemory', _ctypes.c_ulonglong), ] _fmt_: _ClassVar[dict[str, str]] = { 'usedGpuMemory': '%d B', } # pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined class c_nvmlProcessInfo_v2_t(_pynvml._PrintableStructure): # pylint: disable=protected-access _fields_: _ClassVar[list[tuple[str, type]]] = [ # Process ID ('pid', _ctypes.c_uint), # Amount of used GPU memory in bytes. # Under WDDM, NVML_VALUE_NOT_AVAILABLE is always reported because Windows KMD manages # all the memory and not the NVIDIA driver. ('usedGpuMemory', _ctypes.c_ulonglong), # If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to 0xFFFFFFFF # otherwise. ('gpuInstanceId', _ctypes.c_uint), # If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to # 0xFFFFFFFF otherwise. ('computeInstanceId', _ctypes.c_uint), ] _fmt_: _ClassVar[dict[str, str]] = { 'usedGpuMemory': '%d B', } # pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined class c_nvmlProcessInfo_v3_t(_pynvml._PrintableStructure): # pylint: disable=protected-access _fields_: _ClassVar[list[tuple[str, type]]] = [ # Process ID ('pid', _ctypes.c_uint), # Amount of used GPU memory in bytes. # Under WDDM, NVML_VALUE_NOT_AVAILABLE is always reported because Windows KMD manages # all the memory and not the NVIDIA driver. ('usedGpuMemory', _ctypes.c_ulonglong), # If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to 0xFFFFFFFF # otherwise. ('gpuInstanceId', _ctypes.c_uint), # If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to # 0xFFFFFFFF otherwise. ('computeInstanceId', _ctypes.c_uint), # Amount of used GPU conf compute protected memory in bytes. 
('usedGpuCcProtectedMemory', _ctypes.c_ulonglong), ] _fmt_: _ClassVar[dict[str, str]] = { 'usedGpuMemory': '%d B', 'usedGpuCcProtectedMemory': '%d B', } __get_running_processes_version_suffix = None c_nvmlProcessInfo_t = c_nvmlProcessInfo_v3_t def __determine_get_running_processes_version_suffix() -> str: global __get_running_processes_version_suffix, c_nvmlProcessInfo_t # pylint: disable=global-statement if __get_running_processes_version_suffix is None: # pylint: disable-next=protected-access,no-member nvmlGetFunctionPointer = _pynvml._nvmlGetFunctionPointer __get_running_processes_version_suffix = '_v3' def lookup(symbol: str) -> _Any | None: try: ptr = nvmlGetFunctionPointer(symbol) except NVMLError_FunctionNotFound: LOGGER.debug('Failed to found symbol `%s`.', symbol) return None LOGGER.debug('Found symbol `%s`.', symbol) return ptr if lookup('nvmlDeviceGetComputeRunningProcesses_v3'): if lookup('nvmlDeviceGetConfComputeMemSizeInfo') and not lookup( 'nvmlDeviceGetRunningProcessDetailList', ): LOGGER.debug( 'NVML get running process version 3 API with v3 type struct is available.', ) else: c_nvmlProcessInfo_t = c_nvmlProcessInfo_v2_t LOGGER.debug( 'NVML get running process version 3 API with v3 type struct is not ' 'available due to incompatible NVIDIA driver. Fallback to use get running ' 'process version 3 API with v2 type struct.', ) else: c_nvmlProcessInfo_t = c_nvmlProcessInfo_v2_t __get_running_processes_version_suffix = '_v2' LOGGER.debug( 'NVML get running process version 3 API with v3 type struct is not available ' 'due to incompatible NVIDIA driver. 
Fallback to use get running process ' 'version 2 API with v2 type struct.', ) if lookup('nvmlDeviceGetComputeRunningProcesses_v2'): LOGGER.debug( 'NVML get running process version 2 API with v2 type struct is available.', ) else: c_nvmlProcessInfo_t = c_nvmlProcessInfo_v1_t __get_running_processes_version_suffix = '' LOGGER.debug( 'NVML get running process version 2 API with v2 type struct is not ' 'available due to incompatible NVIDIA driver. Fallback to use get ' 'running process version 1 API with v1 type struct.', ) return __get_running_processes_version_suffix def __nvml_device_get_running_processes( func: str, handle: c_nvmlDevice_t, ) -> list[c_nvmlProcessInfo_t]: """Helper function for :func:`nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`. Modified from function :func:`pynvml.nvmlDeviceGetComputeRunningProcesses` in package `nvidia-ml-py `_. """ version_suffix = __determine_get_running_processes_version_suffix() # First call to get the size c_count = _ctypes.c_uint(0) # pylint: disable-next=protected-access fn = _pynvml._nvmlGetFunctionPointer(f'{func}{version_suffix}') ret = fn(handle, _ctypes.byref(c_count), None) if ret == NVML_SUCCESS: # Special case, no running processes return [] if ret == NVML_ERROR_INSUFFICIENT_SIZE: # Typical case # Oversize the array in case more processes are created c_count.value = c_count.value * 2 + 5 process_array = c_nvmlProcessInfo_t * c_count.value # type: ignore[operator] c_processes = process_array() # type: ignore[operator] # Make the call again ret = fn(handle, _ctypes.byref(c_count), c_processes) _pynvml._nvmlCheckReturn(ret) # pylint: disable=protected-access processes = [] for i in range(c_count.value): # Use an alternative struct for this object obj = _pynvml.nvmlStructToFriendlyObject(c_processes[i]) if obj.usedGpuMemory == ULONGLONG_MAX: # Special case for WDDM on Windows, see comment above obj.usedGpuMemory = None processes.append(obj) return processes # Error case raise NVMLError(ret) def 
nvmlDeviceGetComputeRunningProcesses( # pylint: disable=function-redefined handle: c_nvmlDevice_t, ) -> list[c_nvmlProcessInfo_t]: """Get information about processes with a compute context on a device. Note: - In MIG mode, if device handle is provided, the API returns aggregate information, only if the caller has appropriate privileges. Per-instance information can be queried by using specific MIG device handles. Raises: NVMLError_Uninitialized: If NVML was not first initialized with :func:`nvmlInit`. NVMLError_NoPermission: If the user doesn't have permission to perform this operation. NVMLError_InvalidArgument: If device is invalid. NVMLError_GpuIsLost: If the target GPU has fallen off the bus or is otherwise inaccessible. NVMLError_Unknown: On any unexpected error. """ return __nvml_device_get_running_processes('nvmlDeviceGetComputeRunningProcesses', handle) def nvmlDeviceGetGraphicsRunningProcesses( # pylint: disable=function-redefined handle: c_nvmlDevice_t, ) -> list[c_nvmlProcessInfo_t]: """Get information about processes with a graphics context on a device. Note: - In MIG mode, if device handle is provided, the API returns aggregate information, only if the caller has appropriate privileges. Per-instance information can be queried by using specific MIG device handles. Raises: NVMLError_Uninitialized: If NVML was not first initialized with :func:`nvmlInit`. NVMLError_NoPermission: If the user doesn't have permission to perform this operation. NVMLError_InvalidArgument: If device is invalid. NVMLError_GpuIsLost: If the target GPU has fallen off the bus or is otherwise inaccessible. NVMLError_Unknown: On any unexpected error. """ return __nvml_device_get_running_processes('nvmlDeviceGetGraphicsRunningProcesses', handle) def nvmlDeviceGetMPSComputeRunningProcesses( # pylint: disable=function-redefined handle: c_nvmlDevice_t, ) -> list[c_nvmlProcessInfo_t]: """Get information about processes with a MPS compute context on a device. 
Note: - In MIG mode, if device handle is provided, the API returns aggregate information, only if the caller has appropriate privileges. Per-instance information can be queried by using specific MIG device handles. Raises: NVMLError_Uninitialized: If NVML was not first initialized with :func:`nvmlInit`. NVMLError_NoPermission: If the user doesn't have permission to perform this operation. NVMLError_InvalidArgument: If device is invalid. NVMLError_GpuIsLost: If the target GPU has fallen off the bus or is otherwise inaccessible. NVMLError_Unknown: On any unexpected error. """ return __nvml_device_get_running_processes( 'nvmlDeviceGetMPSComputeRunningProcesses', handle, ) else: LOGGER.warning( 'Your installed package `nvidia-ml-py` is corrupted. ' 'Skip patch functions `nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`. ' 'You may get incorrect or incomplete results. Please consider reinstall package ' '`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.', ) # Patch function `nvmlDeviceGetMemoryInfo` if not _pynvml_installation_corrupted: # pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined class c_nvmlMemory_v1_t(_pynvml._PrintableStructure): # pylint: disable=protected-access _fields_: _ClassVar[list[tuple[str, type]]] = [ # Total physical device memory (in bytes). ('total', _ctypes.c_ulonglong), # Unallocated device memory (in bytes). ('free', _ctypes.c_ulonglong), # Allocated device memory (in bytes). # Note that the driver/GPU always sets aside a small amount of memory for bookkeeping. ('used', _ctypes.c_ulonglong), ] _fmt_: _ClassVar[dict[str, str]] = {'': '%d B'} # pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined class c_nvmlMemory_v2_t(_pynvml._PrintableStructure): # pylint: disable=protected-access _fields_: _ClassVar[list[tuple[str, type]]] = [ # Structure format version (must be 2). ('version', _ctypes.c_uint), # Total physical device memory (in bytes). 
('total', _ctypes.c_ulonglong), # Device memory (in bytes) reserved for system use (driver or firmware). ('reserved', _ctypes.c_ulonglong), # Unallocated device memory (in bytes). ('free', _ctypes.c_ulonglong), # Allocated device memory (in bytes). # Note that the driver/GPU always sets aside a small amount of memory for bookkeeping. ('used', _ctypes.c_ulonglong), ] _fmt_: _ClassVar[dict[str, str]] = {'': '%d B'} nvmlMemory_v2 = getattr(_pynvml, 'nvmlMemory_v2', _ctypes.sizeof(c_nvmlMemory_v2_t) | 2 << 24) __get_memory_info_version_suffix = None c_nvmlMemory_t = c_nvmlMemory_v2_t def __determine_get_memory_info_version_suffix() -> str: global __get_memory_info_version_suffix, c_nvmlMemory_t # pylint: disable=global-statement if __get_memory_info_version_suffix is None: # pylint: disable-next=protected-access,no-member nvml_get_function_pointer = _pynvml._nvmlGetFunctionPointer __get_memory_info_version_suffix = '_v2' try: nvml_get_function_pointer('nvmlDeviceGetMemoryInfo_v2') except NVMLError_FunctionNotFound: LOGGER.debug('Failed to found symbol `nvmlDeviceGetMemoryInfo_v2`.') c_nvmlMemory_t = c_nvmlMemory_v1_t __get_memory_info_version_suffix = '' LOGGER.debug( 'NVML get memory info version 2 API is not available due to incompatible ' 'NVIDIA driver. Fallback to use NVML get memory info version 1 API.', ) else: LOGGER.debug('Found symbol `nvmlDeviceGetMemoryInfo_v2`.') LOGGER.debug('NVML get memory info version 2 is available.') return __get_memory_info_version_suffix def nvmlDeviceGetMemoryInfo( # pylint: disable=function-redefined handle: c_nvmlDevice_t, ) -> c_nvmlMemory_t: """Retrieve the amount of used, free, reserved and total memory available on the device, in bytes. Note: - The version 2 API adds additional memory information. The reserved amount is supported on version 2 only. - In MIG mode, if device handle is provided, the API returns aggregate information, only if the caller has appropriate privileges. 
Per-instance information can be queried by using specific MIG device handles. Raises: NVMLError_Uninitialized: If NVML was not first initialized with :func:`nvmlInit`. NVMLError_NoPermission: If the user doesn't have permission to perform this operation. NVMLError_InvalidArgument: If device is invalid. NVMLError_GpuIsLost: If the target GPU has fallen off the bus or is otherwise inaccessible. NVMLError_Unknown: On any unexpected error. """ version_suffix = __determine_get_memory_info_version_suffix() if version_suffix == '_v2': c_memory = c_nvmlMemory_v2_t() c_memory.version = nvmlMemory_v2 # pylint: disable=attribute-defined-outside-init # pylint: disable-next=protected-access fn = _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2') elif version_suffix in {'_v1', ''}: c_memory = c_nvmlMemory_v1_t() # pylint: disable-next=protected-access fn = _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo') else: raise ValueError( f'Unknown version suffix {version_suffix!r} for ' 'function `nvmlDeviceGetMemoryInfo`.', ) ret = fn(handle, _ctypes.byref(c_memory)) _pynvml._nvmlCheckReturn(ret) # pylint: disable=protected-access return c_memory else: LOGGER.warning( 'Your installed package `nvidia-ml-py` is corrupted. ' 'Skip patch functions `nvmlDeviceGetMemoryInfo`. ' 'You may get incorrect or incomplete results. Please consider reinstall package ' '`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.', ) # Add support for lookup fallback and context manager ############################################## class _CustomModule(_ModuleType): """Modified module type to support lookup fallback and context manager. Automatic lookup fallback: >>> libnvml.c_nvmlGpuInstance_t # fallback to pynvml.c_nvmlGpuInstance_t Context manager: >>> with libnvml: ... handle = libnvml.nvmlDeviceGetHandleByIndex(0) ... # The NVML context has been shutdown """ def __getattribute__(self, name: str) -> _Any | _Callable[..., _Any]: """Get a member from the current module. 
Fallback to the original package if missing.""" try: return super().__getattribute__(name) except AttributeError: return getattr(_pynvml, name) def __enter__(self) -> _Self: """Entry of the context manager for ``with`` statement.""" _lazy_init() return self def __exit__(self, *exc: object) -> None: """Shutdown the NVML context in the context manager for ``with`` statement.""" try: nvmlShutdown() except NVMLError: pass # Replace entry in sys.modules for this module with an instance of _CustomModule __modself = _sys.modules[__name__] __modself.__class__ = _CustomModule nvitop-1.4.2/nvitop/api/process.py000066400000000000000000001174131474547113600172000ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
def command_join(cmdline: list[str]) -> str:
    """Return a shell-escaped string from command line arguments."""
    if len(cmdline) == 1:
        sole_arg = cmdline[0]
        # A single token that is not an absolute path to an existing file is
        # returned verbatim — the process title may have been rewritten by
        # `setproctitle`, in which case quoting would be misleading.
        if not (os.path.isabs(sole_arg) and os.path.isfile(sole_arg)):
            return sole_arg
    return ' '.join(add_quotes(arg) for arg in cmdline)
_USE_FALLBACK_WHEN_RAISE = threading.local() # see also `GpuProcess.failsafe` def auto_garbage_clean( fallback: Any = _RAISE, ) -> Callable[[Callable[..., Any]], Callable[..., Any]]: """Remove the object references in the instance cache if the method call fails (the process is gone). The fallback value will be used with `:meth:`GpuProcess.failsafe`` context manager, otherwise raises an exception when falls. """ def wrapper(func: Callable[..., Any]) -> Callable[..., Any]: @functools.wraps(func) def wrapped(self: GpuProcess, *args: Any, **kwargs: Any) -> Any: try: return func(self, *args, **kwargs) except host.PsutilError as ex: try: with GpuProcess.INSTANCE_LOCK: del GpuProcess.INSTANCES[self.pid, self.device] except (KeyError, AttributeError): pass try: with HostProcess.INSTANCE_LOCK: del HostProcess.INSTANCES[self.pid] except KeyError: pass # See also `GpuProcess.failsafe` if fallback is _RAISE or not getattr(_USE_FALLBACK_WHEN_RAISE, 'value', False): raise if isinstance(fallback, tuple): if isinstance(ex, host.AccessDenied) and fallback == ('No Such Process',): return ['No Permissions'] return list(fallback) return fallback return wrapped return wrapper class HostProcess(host.Process, ABC): """Represent an OS process with the given PID. If PID is omitted current process PID (:func:`os.getpid`) is used. The instance will be cache during the lifetime of the process. 
Examples: >>> HostProcess() # the current process HostProcess(pid=12345, name='python3', status='running', started='00:55:43') >>> p1 = HostProcess(12345) >>> p2 = HostProcess(12345) >>> p1 is p2 # the same instance True >>> import copy >>> copy.deepcopy(p1) is p1 # the same instance True >>> p = HostProcess(pid=12345) >>> p.cmdline() ['python3', '-c', 'import IPython; IPython.terminal.ipapp.launch_new_instance()'] >>> p.command() # the result is in shell-escaped format 'python3 -c "import IPython; IPython.terminal.ipapp.launch_new_instance()"' >>> p.as_snapshot() HostProcessSnapshot( real=HostProcess(pid=12345, name='python3', status='running', started='00:55:43'), cmdline=['python3', '-c', 'import IPython; IPython.terminal.ipapp.launch_new_instance()'], command='python3 -c "import IPython; IPython.terminal.ipapp.launch_new_instance()"', connections=[], cpu_percent=0.3, cpu_times=pcputimes(user=2.180019456, system=0.18424464, children_user=0.0, children_system=0.0), create_time=1656608143.31, cwd='/home/panxuehai', environ={...}, ... 
) """ INSTANCE_LOCK: threading.RLock = threading.RLock() INSTANCES: WeakValueDictionary[int, HostProcess] = WeakValueDictionary() _pid: int _super_gone: bool _username: str | None _ident: tuple _lock: threading.RLock def __new__(cls, pid: int | None = None) -> Self: """Return the cached instance of :class:`HostProcess`.""" if pid is None: pid = os.getpid() with cls.INSTANCE_LOCK: try: instance = cls.INSTANCES[pid] if instance.is_running(): return instance except KeyError: pass instance = super().__new__(cls) instance._super_gone = False instance._username = None host.Process._init(instance, pid, True) try: host.Process.cpu_percent(instance) except host.PsutilError: pass cls.INSTANCES[pid] = instance return instance # pylint: disable-next=unused-argument,super-init-not-called def __init__(self, pid: int | None = None) -> None: """Initialize the instance.""" @property def _gone(self) -> bool: return self._super_gone @_gone.setter def _gone(self, value: bool) -> None: if value: with self.INSTANCE_LOCK: self.INSTANCES.pop(self.pid, None) self._super_gone = value def __repr__(self) -> str: """Return a string representation of the process.""" return super().__repr__().replace(self.__class__.__module__ + '.', '', 1) def __reduce__(self) -> tuple[type[HostProcess], tuple[int]]: """Return state information for pickling.""" return self.__class__, (self.pid,) if host.WINDOWS: def username(self) -> str: """The name of the user that owns the process. On Windows, the domain name will be removed if it is present. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. """ if self._username is None: # pylint: disable=access-member-before-definition self._username = ( # pylint: disable=attribute-defined-outside-init super().username().split('\\')[-1] ) return self._username else: def username(self) -> str: """The name of the user that owns the process. 
On UNIX this is calculated by using *real* process uid. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. """ if self._username is None: # pylint: disable=access-member-before-definition self._username = ( # pylint: disable=attribute-defined-outside-init super().username() ) return self._username @memoize_when_activated def cmdline(self) -> list[str]: """The command line this process has been called with. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. """ cmdline = super().cmdline() if len(cmdline) > 1: cmdline = '\0'.join(cmdline).rstrip('\0').split('\0') return cmdline def command(self) -> str: """Return a shell-escaped string from command line arguments. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. """ return command_join(self.cmdline()) @memoize_when_activated def running_time(self) -> datetime.timedelta: """The elapsed time this process has been running in :class:`datetime.timedelta`. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. """ return datetime.datetime.now() - datetime.datetime.fromtimestamp(self.create_time()) def running_time_human(self) -> str: """The elapsed time this process has been running in human readable format. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. """ return timedelta2human(self.running_time()) def running_time_in_seconds(self) -> float: # in seconds """The elapsed time this process has been running in seconds. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. 
""" return self.running_time().total_seconds() elapsed_time = running_time elapsed_time_human = running_time_human elapsed_time_in_seconds = running_time_in_seconds def rss_memory(self) -> int: # in bytes """The used resident set size (RSS) memory of the process in bytes. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. """ return self.memory_info().rss def parent(self) -> HostProcess | None: """Return the parent process as a :class:`HostProcess` instance or :data:`None` if there is no parent. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. """ parent = super().parent() if parent is not None: return HostProcess(parent.pid) return None def children(self, recursive: bool = False) -> list[HostProcess]: """Return the children of this process as a list of :class:`HostProcess` instances. If *recursive* is :data:`True` return all the descendants. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. """ return [HostProcess(child.pid) for child in super().children(recursive)] @contextlib.contextmanager def oneshot(self) -> Generator[None]: """A utility context manager which considerably speeds up the retrieval of multiple process information at the same time. Internally different process info (e.g. name, ppid, uids, gids, ...) may be fetched by using the same routine, but only one information is returned and the others are discarded. When using this context manager the internal routine is executed once (in the example below on ``name()``) and the other info are cached. The cache is cleared when exiting the context manager block. The advice is to use this every time you retrieve more than one information about the process. 
Examples: >>> from nvitop import HostProcess >>> p = HostProcess() >>> with p.oneshot(): ... p.name() # collect multiple info ... p.cpu_times() # return cached value ... p.cpu_percent() # return cached value ... p.create_time() # return cached value """ # pylint: disable=line-too-long with self._lock: if hasattr(self, '_cache'): yield else: with super().oneshot(): # pylint: disable=no-member try: self.cmdline.cache_activate(self) # type: ignore[attr-defined] self.running_time.cache_activate(self) # type: ignore[attr-defined] yield finally: self.cmdline.cache_deactivate(self) # type: ignore[attr-defined] self.running_time.cache_deactivate(self) # type: ignore[attr-defined] def as_snapshot( self, attrs: Iterable[str] | None = None, ad_value: Any | None = None, ) -> Snapshot: """Return a onetime snapshot of the process.""" with self.oneshot(): attributes = self.as_dict(attrs=attrs, ad_value=ad_value) if attrs is None: for attr in ('command', 'running_time', 'running_time_human'): try: attributes[attr] = getattr(self, attr)() except (host.AccessDenied, host.ZombieProcess): # noqa: PERF203 attributes[attr] = ad_value return Snapshot(real=self, **attributes) @HostProcess.register class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-public-methods """Represent a process with the given PID running on the given GPU device. The instance will be cache during the lifetime of the process. The same host process can use multiple GPU devices. The :class:`GpuProcess` instances representing the same PID on the host but different GPU devices are different. 
""" INSTANCE_LOCK: threading.RLock = threading.RLock() INSTANCES: WeakValueDictionary[tuple[int, Device], GpuProcess] = WeakValueDictionary() _pid: int _host: HostProcess _device: Device _username: str | None _ident: tuple _hash: int | None # pylint: disable-next=too-many-arguments,unused-argument def __new__( cls, pid: int | None, device: Device, *, gpu_memory: int | NaType | None = None, gpu_instance_id: int | NaType | None = None, compute_instance_id: int | NaType | None = None, type: str | NaType | None = None, # pylint: disable=redefined-builtin ) -> Self: """Return the cached instance of :class:`GpuProcess`.""" if pid is None: pid = os.getpid() with cls.INSTANCE_LOCK: try: instance = cls.INSTANCES[pid, device] if instance.is_running(): return instance # type: ignore[return-value] except KeyError: pass instance = super().__new__(cls) instance._pid = pid instance._host = HostProcess(pid) instance._ident = (*instance._host._ident, device.index) instance._device = device instance._hash = None instance._username = None cls.INSTANCES[pid, device] = instance return instance # pylint: disable-next=too-many-arguments def __init__( self, pid: int | None, # pylint: disable=unused-argument device: Device, *, gpu_memory: int | NaType | None = None, gpu_instance_id: int | NaType | None = None, compute_instance_id: int | NaType | None = None, type: str | NaType | None = None, # pylint: disable=redefined-builtin ) -> None: """Initialize the instance returned by :meth:`__new__()`.""" if gpu_memory is None and not hasattr(self, '_gpu_memory'): gpu_memory = NA if gpu_memory is not None: self.set_gpu_memory(gpu_memory) if type is None and not hasattr(self, '_type'): type = NA if type is not None: self.type = type if gpu_instance_id is not None and compute_instance_id is not None: self._gpu_instance_id = gpu_instance_id if gpu_instance_id != UINT_MAX else NA self._compute_instance_id = ( compute_instance_id if compute_instance_id != UINT_MAX else NA ) elif device.is_mig_device(): 
self._gpu_instance_id = device.gpu_instance_id() self._compute_instance_id = device.compute_instance_id() else: self._gpu_instance_id = self._compute_instance_id = NA for util in ('sm', 'memory', 'encoder', 'decoder'): if not hasattr(self, f'_gpu_{util}_utilization'): setattr(self, f'_gpu_{util}_utilization', NA) def __repr__(self) -> str: """Return a string representation of the GPU process.""" return '{}(pid={}, gpu_memory={}, type={}, device={}, host={})'.format( # noqa: UP032 self.__class__.__name__, self.pid, self.gpu_memory_human(), self.type, self.device, self.host, ) def __eq__(self, other: object) -> bool: """Test equality to other object.""" if not isinstance(other, (GpuProcess, host.Process)): return NotImplemented return self._ident == other._ident def __hash__(self) -> int: """Return a hash value of the GPU process.""" if self._hash is None: # pylint: disable=access-member-before-definition self._hash = hash(self._ident) # pylint: disable=attribute-defined-outside-init return self._hash def __getattr__(self, name: str) -> Any | Callable[..., Any]: """Get a member from the instance or fallback to the host process instance if missing. Raises: AttributeError: If the attribute is not defined in either :class:`GpuProcess` nor :class:`HostProcess`. host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. """ try: return super().__getattr__(name) # type: ignore[misc] except AttributeError: if name == '_cache': raise attribute = getattr(self.host, name) if isinstance(attribute, FunctionType): attribute = auto_garbage_clean(fallback=_RAISE)(attribute) setattr(self, name, attribute) return attribute @property def pid(self) -> int: """The process PID.""" return self._pid @property def host(self) -> HostProcess: """The process instance running on the host.""" return self._host @property def device(self) -> Device: """The GPU device the process running on. 
The same host process can use multiple GPU devices. The :class:`GpuProcess` instances representing the same PID on the host but different GPU devices are different. """ return self._device def gpu_instance_id(self) -> int | NaType: """The GPU instance ID of the MIG device, or :const:`nvitop.NA` if not applicable.""" return self._gpu_instance_id def compute_instance_id(self) -> int | NaType: """The compute instance ID of the MIG device, or :const:`nvitop.NA` if not applicable.""" return self._compute_instance_id def gpu_memory(self) -> int | NaType: # in bytes """The used GPU memory in bytes, or :const:`nvitop.NA` if not applicable.""" return self._gpu_memory def gpu_memory_human(self) -> str | NaType: # in human readable """The used GPU memory in human readable format, or :const:`nvitop.NA` if not applicable.""" return self._gpu_memory_human def gpu_memory_percent(self) -> float | NaType: # in percentage """The percentage of used GPU memory by the process, or :const:`nvitop.NA` if not applicable.""" return self._gpu_memory_percent def gpu_sm_utilization(self) -> int | NaType: # in percentage """The utilization rate of SM (Streaming Multiprocessor), or :const:`nvitop.NA` if not applicable.""" return self._gpu_sm_utilization def gpu_memory_utilization(self) -> int | NaType: # in percentage """The utilization rate of GPU memory bandwidth, or :const:`nvitop.NA` if not applicable.""" return self._gpu_memory_utilization def gpu_encoder_utilization(self) -> int | NaType: # in percentage """The utilization rate of the encoder, or :const:`nvitop.NA` if not applicable.""" return self._gpu_encoder_utilization def gpu_decoder_utilization(self) -> int | NaType: # in percentage """The utilization rate of the decoder, or :const:`nvitop.NA` if not applicable.""" return self._gpu_decoder_utilization def set_gpu_memory(self, value: int | NaType) -> None: """Set the used GPU memory in bytes.""" # pylint: disable=attribute-defined-outside-init self._gpu_memory = memory_used = value 
self._gpu_memory_human = bytes2human(self.gpu_memory()) memory_total = self.device.memory_total() gpu_memory_percent = NA if libnvml.nvmlCheckReturn(memory_used, int) and libnvml.nvmlCheckReturn(memory_total, int): gpu_memory_percent = round(100.0 * memory_used / memory_total, 1) # type: ignore[assignment] self._gpu_memory_percent = gpu_memory_percent def set_gpu_utilization( self, gpu_sm_utilization: int | NaType | None = None, gpu_memory_utilization: int | NaType | None = None, gpu_encoder_utilization: int | NaType | None = None, gpu_decoder_utilization: int | NaType | None = None, ) -> None: """Set the GPU utilization rates.""" # pylint: disable=attribute-defined-outside-init if gpu_sm_utilization is not None: self._gpu_sm_utilization = gpu_sm_utilization if gpu_memory_utilization is not None: self._gpu_memory_utilization = gpu_memory_utilization if gpu_encoder_utilization is not None: self._gpu_encoder_utilization = gpu_encoder_utilization if gpu_decoder_utilization is not None: self._gpu_decoder_utilization = gpu_decoder_utilization def update_gpu_status(self) -> int | NaType: """Update the GPU consumption status from a new NVML query.""" self.set_gpu_memory(NA) self.set_gpu_utilization(NA, NA, NA, NA) processes = self.device.processes() process = processes.get(self.pid, self) if process is not self: # The current process is gone and the instance has been removed from the cache. # Update GPU status from the new instance. self.set_gpu_memory(process.gpu_memory()) self.set_gpu_utilization( process.gpu_sm_utilization(), process.gpu_memory_utilization(), process.gpu_encoder_utilization(), process.gpu_decoder_utilization(), ) return self.gpu_memory() @property def type(self) -> str | NaType: """The type of the GPU context. 
The type is one of the following: - :data:`'C'`: compute context - :data:`'G'`: graphics context - :data:`'C+G'`: both compute context and graphics context - :data:`'N/A'`: not applicable """ return self._type @type.setter def type(self, value: str | NaType) -> None: if 'C' in value and 'G' in value: self._type = 'C+G' elif 'C' in value: self._type = 'C' elif 'G' in value: self._type = 'G' else: self._type = NA @auto_garbage_clean(fallback=False) def is_running(self) -> bool: """Return whether this process is running.""" return self.host.is_running() @auto_garbage_clean(fallback='terminated') def status(self) -> str: """The process current status. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ return self.host.status() @auto_garbage_clean(fallback=NA) def create_time(self) -> float | NaType: """The process creation time as a floating point number expressed in seconds since the epoch. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ return self.host.create_time() @auto_garbage_clean(fallback=NA) def running_time(self) -> datetime.timedelta | NaType: """The elapsed time this process has been running in :class:`datetime.timedelta`. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. 
See also :meth:`take_snapshots` and :meth:`failsafe`. """ return self.host.running_time() def running_time_human(self) -> str | NaType: """The elapsed time this process has been running in human readable format. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ return timedelta2human(self.running_time()) def running_time_in_seconds(self) -> float | NaType: """The elapsed time this process has been running in seconds. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ running_time = self.running_time() if running_time is NA: return NA return running_time.total_seconds() elapsed_time = running_time elapsed_time_human = running_time_human elapsed_time_in_seconds = running_time_in_seconds @auto_garbage_clean(fallback=NA) def username(self) -> str | NaType: """The name of the user that owns the process. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ if self._username is None: # pylint: disable=access-member-before-definition self._username = self.host.username() # pylint: disable=attribute-defined-outside-init return self._username @auto_garbage_clean(fallback=NA) def name(self) -> str | NaType: """The process name. 
Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ return self.host.name() @auto_garbage_clean(fallback=NA) def cpu_percent(self) -> float | NaType: # in percentage """Return a float representing the current process CPU utilization as a percentage. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ return self.host.cpu_percent() @auto_garbage_clean(fallback=NA) def memory_percent(self) -> float | NaType: # in percentage """Compare process RSS memory to total physical system memory and calculate process memory utilization as a percentage. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ # pylint: disable=line-too-long return self.host.memory_percent() host_memory_percent = memory_percent # in percentage @auto_garbage_clean(fallback=NA) def host_memory(self) -> int | NaType: # in bytes """The used resident set size (RSS) memory of the process in bytes. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. 
See also :meth:`take_snapshots` and :meth:`failsafe`. """ return self.host.rss_memory() def host_memory_human(self) -> str | NaType: """The used resident set size (RSS) memory of the process in human readable format. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ return bytes2human(self.host_memory()) rss_memory = host_memory # in bytes # For `AccessDenied` error the fallback value is `['No Permissions']` @auto_garbage_clean(fallback=('No Such Process',)) def cmdline(self) -> list[str]: """The command line this process has been called with. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ cmdline = self.host.cmdline() if len(cmdline) == 0 and not self._gone: cmdline = ['Zombie Process'] return cmdline def command(self) -> str: """Return a shell-escaped string from command line arguments. Raises: host.NoSuchProcess: If the process is gone. host.AccessDenied: If the user do not have read privilege to the process' status file. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. 
""" return command_join(self.cmdline()) @auto_garbage_clean(fallback=_RAISE) def host_snapshot(self) -> Snapshot: """Return a onetime snapshot of the host process.""" with self.host.oneshot(): return Snapshot( real=self.host, is_running=self.is_running(), status=self.status(), username=self.username(), name=self.name(), cmdline=self.cmdline(), command=self.command(), cpu_percent=self.cpu_percent(), memory_percent=self.memory_percent(), host_memory=self.host_memory(), host_memory_human=self.host_memory_human(), running_time=self.running_time(), running_time_human=self.running_time_human(), running_time_in_seconds=self.running_time_in_seconds(), ) @auto_garbage_clean(fallback=_RAISE) def as_snapshot( self, *, host_process_snapshot_cache: dict[int, Snapshot] | None = None, ) -> Snapshot: """Return a onetime snapshot of the process on the GPU device. Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. Also, consider using the batched version to take snapshots with :meth:`GpuProcess.take_snapshots`, which caches the results and reduces redundant queries. See also :meth:`take_snapshots` and :meth:`failsafe`. 
""" host_process_snapshot_cache = host_process_snapshot_cache or {} try: host_snapshot = host_process_snapshot_cache[self.pid] except KeyError: host_snapshot = host_process_snapshot_cache[self.pid] = self.host_snapshot() return Snapshot( real=self, pid=self.pid, # host host=host_snapshot, is_running=host_snapshot.is_running, status=host_snapshot.status, username=host_snapshot.username, name=host_snapshot.name, cmdline=host_snapshot.cmdline, command=host_snapshot.command, cpu_percent=host_snapshot.cpu_percent, memory_percent=host_snapshot.memory_percent, host_memory=host_snapshot.host_memory, host_memory_human=host_snapshot.host_memory_human, running_time=host_snapshot.running_time, running_time_human=host_snapshot.running_time_human, running_time_in_seconds=host_snapshot.running_time_in_seconds, # device device=self.device, type=self.type, gpu_instance_id=self.gpu_instance_id(), compute_instance_id=self.compute_instance_id(), gpu_memory=self.gpu_memory(), gpu_memory_human=self.gpu_memory_human(), gpu_memory_percent=self.gpu_memory_percent(), gpu_sm_utilization=self.gpu_sm_utilization(), gpu_memory_utilization=self.gpu_memory_utilization(), gpu_encoder_utilization=self.gpu_encoder_utilization(), gpu_decoder_utilization=self.gpu_decoder_utilization(), ) @classmethod def take_snapshots( # batched version of `as_snapshot` cls, gpu_processes: Iterable[GpuProcess], *, failsafe: bool = False, ) -> list[Snapshot]: """Take snapshots for a list of :class:`GpuProcess` instances. If *failsafe* is :data:`True`, then if any method fails, the fallback value in :func:`auto_garbage_clean` will be used. 
""" cache: dict[int, Snapshot] = {} context: Callable[[], contextlib.AbstractContextManager[None]] = ( cls.failsafe if failsafe else contextlib.nullcontext ) with context(): return [ process.as_snapshot(host_process_snapshot_cache=cache) for process in gpu_processes ] @classmethod @contextlib.contextmanager def failsafe(cls) -> Generator[None]: """A context manager that enables fallback values for methods that fail. Examples: >>> p = GpuProcess(pid=10000, device=Device(0)) # process does not exist >>> p GpuProcess(pid=10000, gpu_memory=N/A, type=N/A, device=PhysicalDevice(index=0, name="NVIDIA GeForce RTX 3070", total_memory=8192MiB), host=HostProcess(pid=10000, status='terminated')) >>> p.cpu_percent() Traceback (most recent call last): ... NoSuchProcess: process no longer exists (pid=10000) >>> # Failsafe to the fallback value instead of raising exceptions ... with GpuProcess.failsafe(): ... print('fallback: {!r}'.format(p.cpu_percent())) ... print('fallback (float cast): {!r}'.format(float(p.cpu_percent()))) # `nvitop.NA` can be cast to float or int ... print('fallback (int cast): {!r}'.format(int(p.cpu_percent()))) # `nvitop.NA` can be cast to float or int fallback: 'N/A' fallback (float cast): nan fallback (int cast): 0 """ # pylint: disable=line-too-long global _USE_FALLBACK_WHEN_RAISE # pylint: disable=global-statement,global-variable-not-assigned prev_value = getattr(_USE_FALLBACK_WHEN_RAISE, 'value', False) try: _USE_FALLBACK_WHEN_RAISE.value = True yield finally: _USE_FALLBACK_WHEN_RAISE.value = prev_value nvitop-1.4.2/nvitop/api/termcolor.py000066400000000000000000000173221474547113600175260ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== # pylint: disable=wrong-spelling-in-comment # Vendored from the `termcolor` package: https://github.com/termcolor/termcolor # ============================================================================== # Copyright (c) 2008-2011 Volvox Development Team # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# # Author: Konstantin Lepa # ============================================================================== """ANSI color formatting for output in terminal.""" from __future__ import annotations import io import os import sys from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from collections.abc import Iterable from typing_extensions import Literal # Python 3.8+ Attribute = Literal[ 'bold', 'dark', 'underline', 'blink', 'reverse', 'concealed', 'strike', ] Highlight = Literal[ 'on_black', 'on_grey', 'on_red', 'on_green', 'on_yellow', 'on_blue', 'on_magenta', 'on_cyan', 'on_light_grey', 'on_dark_grey', 'on_light_red', 'on_light_green', 'on_light_yellow', 'on_light_blue', 'on_light_magenta', 'on_light_cyan', 'on_white', ] Color = Literal[ 'black', 'grey', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'light_grey', 'dark_grey', 'light_red', 'light_green', 'light_yellow', 'light_blue', 'light_magenta', 'light_cyan', 'white', ] __all__ = ['colored', 'cprint'] if os.name == 'nt': # Windows try: from colorama import init except ImportError: pass else: init() ATTRIBUTES: dict[Attribute, int] = { 'bold': 1, 'dark': 2, 'underline': 4, 'blink': 5, 'reverse': 7, 'concealed': 8, 'strike': 9, } HIGHLIGHTS: dict[Highlight, int] = { 'on_black': 40, 'on_grey': 40, # Actually black but kept for backwards compatibility 'on_red': 41, 'on_green': 42, 'on_yellow': 43, 'on_blue': 44, 'on_magenta': 45, 'on_cyan': 46, 'on_light_grey': 47, 'on_dark_grey': 100, 'on_light_red': 101, 'on_light_green': 102, 'on_light_yellow': 103, 'on_light_blue': 104, 'on_light_magenta': 105, 'on_light_cyan': 106, 'on_white': 107, } COLORS: dict[Color, int] = { 'black': 30, 'grey': 30, # Actually black but kept for backwards compatibility 'red': 31, 'green': 32, 'yellow': 33, 'blue': 34, 'magenta': 35, 'cyan': 36, 'light_grey': 37, 'dark_grey': 90, 'light_red': 91, 'light_green': 92, 'light_yellow': 93, 'light_blue': 94, 'light_magenta': 95, 'light_cyan': 96, 'white': 97, } RESET = '\033[0m' # 
pylint: disable-next=too-many-return-statements def _can_do_color( *, no_color: bool | None = None, force_color: bool | None = None, ) -> bool: """Check env vars and for tty/dumb terminal.""" # First check overrides: # "User-level configuration files and per-instance command-line arguments should # override $NO_COLOR. A user should be able to export $NO_COLOR in their shell # configuration file as a default, but configure a specific program in its # configuration file to specifically enable color." # https://no-color.org if no_color is not None and no_color: return False if force_color is not None and force_color: return True # Then check env vars: if 'ANSI_COLORS_DISABLED' in os.environ: return False if 'NO_COLOR' in os.environ: return False if 'FORCE_COLOR' in os.environ: return True # Then check system: if os.environ.get('TERM') == 'dumb': return False if not hasattr(sys.stdout, 'fileno'): return False try: return os.isatty(sys.stdout.fileno()) except io.UnsupportedOperation: return sys.stdout.isatty() # pylint: disable-next=too-many-arguments def colored( text: Any, color: Color | None = None, on_color: Highlight | None = None, attrs: Iterable[Attribute] | None = None, *, no_color: bool | None = None, force_color: bool | None = None, ) -> str: """Colorize text. Available text colors: black, red, green, yellow, blue, magenta, cyan, white, light_grey, dark_grey, light_red, light_green, light_yellow, light_blue, light_magenta, light_cyan. Available text highlights: on_black, on_red, on_green, on_yellow, on_blue, on_magenta, on_cyan, on_white, on_light_grey, on_dark_grey, on_light_red, on_light_green, on_light_yellow, on_light_blue, on_light_magenta, on_light_cyan. Available attributes: bold, dark, underline, blink, reverse, concealed. 
Example: colored('Hello, World!', 'red', 'on_black', ['bold', 'blink']) colored('Hello, World!', 'green') """ result = str(text) if not _can_do_color(no_color=no_color, force_color=force_color): return result fmt_str = '\033[%dm%s' if color is not None: result = fmt_str % (COLORS[color], result) if on_color is not None: result = fmt_str % (HIGHLIGHTS[on_color], result) if attrs is not None: for attr in attrs: result = fmt_str % (ATTRIBUTES[attr], result) result += RESET return result # pylint: disable-next=too-many-arguments def cprint( text: object, color: Color | None = None, on_color: Highlight | None = None, attrs: Iterable[Attribute] | None = None, *, no_color: bool | None = None, force_color: bool | None = None, **kwargs: Any, ) -> None: """Print colorized text. It accepts arguments of print function. """ print( colored( text, color, on_color, attrs, no_color=no_color, force_color=force_color, ), **kwargs, ) nvitop-1.4.2/nvitop/api/utils.py000066400000000000000000000556231474547113600166660ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== """Utilities of nvitop APIs.""" # pylint: disable=invalid-name from __future__ import annotations import ctypes import datetime import functools import math import os import re import sys import time from collections.abc import KeysView from typing import TYPE_CHECKING, Any, Callable, TypeVar from nvitop.api import termcolor if TYPE_CHECKING: from collections.abc import Generator, Iterable, Iterator __all__ = [ 'NA', 'NaType', 'NotApplicable', 'NotApplicableType', 'UINT_MAX', 'ULONGLONG_MAX', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'SIZE_UNITS', 'bytes2human', 'human2bytes', 'timedelta2human', 'utilization2string', 'colored', 'set_color', 'boolify', 'Snapshot', ] COLOR: bool = sys.stdout.isatty() def set_color(value: bool) -> None: """Force enable text coloring.""" global COLOR # pylint: disable=global-statement COLOR = bool(value) if COLOR: os.environ['FORCE_COLOR'] = '1' os.environ.pop('NO_COLOR', None) else: os.environ.pop('FORCE_COLOR', None) os.environ['NO_COLOR'] = '1' def colored( text: Any, color: termcolor.Color | None = None, on_color: termcolor.Highlight | None = None, attrs: Iterable[termcolor.Attribute] | None = None, ) -> str: """Colorize text with ANSI color escape codes. Available text colors: red, green, yellow, blue, magenta, cyan, white. Available text highlights: on_red, on_green, on_yellow, on_blue, on_magenta, on_cyan, on_white. Available attributes: bold, dark, underline, blink, reverse, concealed. Examples: >>> colored('Hello, World!', 'red', 'on_grey', ['blue', 'blink']) >>> colored('Hello, World!', 'green') """ if COLOR: return termcolor.colored(text, color=color, on_color=on_color, attrs=attrs) return str(text) class NaType(str): """A singleton (:const:`str: 'N/A'`) class represents a not applicable value. The :const:`NA` instance behaves like a :class:`str` instance (:const:`'N/A'`) when doing string manipulation (e.g. concatenation). 
For arithmetic operations, for example ``NA / 1024 / 1024``, it acts like the :data:`math.nan`. Examples: >>> NA 'N/A' >>> 'memory usage: {}'.format(NA) # NA is an instance of `str` 'memory usage: N/A' >>> NA.lower() # NA is an instance of `str` 'n/a' >>> NA.ljust(5) # NA is an instance of `str` 'N/A ' >>> NA + ' str' # string contamination if the operand is a string 'N/A str' >>> float(NA) # explicit conversion to float (`math.nan`) nan >>> NA + 1 # auto-casting to float if the operand is a number nan >>> NA * 1024 # auto-casting to float if the operand is a number nan >>> NA / (1024 * 1024) # auto-casting to float if the operand is a number nan """ # NOTE: Decorate this class with `@final` and remove `noqa` when we drop Python 3.7 support. def __new__(cls) -> NaType: # noqa: PYI034 """Get the singleton instance (:const:`nvitop.NA`).""" if not hasattr(cls, '_instance'): cls._instance = super().__new__(cls, 'N/A') return cls._instance def __bool__(self) -> bool: """Convert :const:`NA` to :class:`bool` and return :data:`False`. >>> bool(NA) False """ return False def __int__(self) -> int: """Convert :const:`NA` to :class:`int` and return :const:`0`. >>> int(NA) 0 """ return 0 def __float__(self) -> float: """Convert :const:`NA` to :class:`float` and return :data:`math.nan`. >>> float(NA) nan >>> float(NA) is math.nan True """ return math.nan def __add__(self, other: object) -> str | float: # type: ignore[override] """Return :data:`math.nan` if the operand is a number or uses string concatenation if the operand is a string (``NA + other``). A special case is when the operand is :const:`nvitop.NA` itself, the result is :data:`math.nan` instead of :const:`'N/AN/A'`. 
>>> NA + ' str' 'N/A str' >>> NA + NA nan >>> NA + 1 nan >>> NA + 1.0 nan """ # pylint: disable=line-too-long if isinstance(other, (int, float)): return float(self) + other if other is NA: return float(self) return super().__add__(other) # type: ignore[operator] def __radd__(self, other: object) -> str | float: """Return :data:`math.nan` if the operand is a number or uses string concatenation if the operand is a string (``other + NA``). >>> 'str' + NA 'strN/A' >>> 1 + NA nan >>> 1.0 + NA nan """ # pylint: disable=line-too-long if isinstance(other, (int, float)): return other + float(self) return NotImplemented def __sub__(self, other: object) -> float: """Return :data:`math.nan` if the operand is a number (``NA - other``). >>> NA - 'str' TypeError: unsupported operand type(s) for -: 'NaType' and 'str' >>> NA - NA 'N/AN/A' >>> NA + 1 nan >>> NA + 1.0 nan """ if isinstance(other, (int, float)): return float(self) - other if other is NA: return float(self) return NotImplemented def __rsub__(self, other: object) -> float: """Return :data:`math.nan` if the operand is a number (``other - NA``). >>> 'str' - NA TypeError: unsupported operand type(s) for -: 'str' and 'NaType' >>> 1 - NA nan >>> 1.0 - NA nan """ if isinstance(other, (int, float)): return other - float(self) return NotImplemented def __mul__(self, other: object) -> float: # type: ignore[override] """Return :data:`math.nan` if the operand is a number (``NA * other``). A special case is when the operand is :const:`nvitop.NA` itself, the result is also :data:`math.nan`. >>> NA * 1024 nan >>> NA * 1024.0 nan >>> NA * NA nan """ if isinstance(other, (int, float)): return float(self) * other if other is NA: return float(self) return NotImplemented def __rmul__(self, other: object) -> float: # type: ignore[override] """Return :data:`math.nan` if the operand is a number (``other * NA``). 
>>> 1024 * NA nan >>> 1024.0 * NA nan """ if isinstance(other, (int, float)): return other * float(self) return NotImplemented def __truediv__(self, other: object) -> float: """Return :data:`math.nan` if the operand is a number (``NA / other``). >>> NA / 1024 nan >>> NA / 1024.0 nan >>> NA / 0 ZeroDivisionError: float division by zero >>> NA / 0.0 ZeroDivisionError: float division by zero >>> NA / NA nan """ if isinstance(other, (int, float)): return float(self) / other if other is NA: return float(self) return NotImplemented def __rtruediv__(self, other: object) -> float: """Return :data:`math.nan` if the operand is a number (``other / NA``). >>> 1024 / NA nan >>> 1024.0 / NA nan """ if isinstance(other, (int, float)): return other / float(self) return NotImplemented def __floordiv__(self, other: object) -> float: """Return :data:`math.nan` if the operand is a number (``NA // other``). >>> NA // 1024 nan >>> NA // 1024.0 nan >>> NA / 0 ZeroDivisionError: float division by zero >>> NA / 0.0 ZeroDivisionError: float division by zero >>> NA // NA nan """ if isinstance(other, (int, float)): return float(self) // other if other is NA: return float(self) return NotImplemented def __rfloordiv__(self, other: object) -> float: """Return :data:`math.nan` if the operand is a number (``other // NA``). >>> 1024 // NA nan >>> 1024.0 // NA nan """ if isinstance(other, (int, float)): return other // float(self) return NotImplemented def __mod__(self, other: object) -> float: # type: ignore[override] """Return :data:`math.nan` if the operand is a number (``NA % other``). >>> NA % 1024 nan >>> NA % 1024.0 nan >>> NA % 0 ZeroDivisionError: float modulo >>> NA % 0.0 ZeroDivisionError: float modulo """ if isinstance(other, (int, float)): return float(self) % other if other is NA: return float(self) return NotImplemented def __rmod__(self, other: object) -> float: """Return :data:`math.nan` if the operand is a number (``other % NA``). 
>>> 1024 % NA nan >>> 1024.0 % NA nan """ if isinstance(other, (int, float)): return other % float(self) return NotImplemented def __divmod__(self, other: object) -> tuple[float, float]: """The pair ``(NA // other, NA % other)`` (``divmod(NA, other)``). >>> divmod(NA, 1024) (nan, nan) >>> divmod(NA, 1024.0) (nan, nan) >>> divmod(NA, 0) ZeroDivisionError: float floor division by zero >>> divmod(NA, 0.0) ZeroDivisionError: float floor division by zero """ return (self // other, self % other) def __rdivmod__(self, other: object) -> tuple[float, float]: """The pair ``(other // NA, other % NA)`` (``divmod(other, NA)``). >>> divmod(1024, NA) (nan, nan) >>> divmod(1024.0, NA) (nan, nan) """ return (other // self, other % self) def __pos__(self) -> float: """Return :data:`math.nan` (``+NA``). >>> +NA nan """ return +float(self) def __neg__(self) -> float: """Return :data:`math.nan` (``-NA``). >>> -NA nan """ return -float(self) def __abs__(self) -> float: """Return :data:`math.nan` (``abs(NA)``). >>> abs(NA) nan """ return abs(float(self)) def __round__(self, ndigits: int | None = None) -> int | float: """Round :const:`nvitop.NA` to ``ndigits`` decimal places, defaulting to :const:`0`. If ``ndigits`` is omitted or :data:`None`, returns :const:`0`, otherwise returns :data:`math.nan`. 
>>> round(NA) 0 >>> round(NA, 0) nan >>> round(NA, 1) nan """ if ndigits is None: return int(self) return round(float(self), ndigits) def __lt__(self, x: object) -> bool: """The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string.""" if isinstance(x, (int, float)): return False return super().__lt__(x) # type: ignore[operator] def __le__(self, x: object) -> bool: """The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string.""" if isinstance(x, (int, float)): return False return super().__le__(x) # type: ignore[operator] def __gt__(self, x: object) -> bool: """The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string.""" if isinstance(x, (int, float)): return True return super().__gt__(x) # type: ignore[operator] def __ge__(self, x: object) -> bool: """The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string.""" if isinstance(x, (int, float)): return True return super().__ge__(x) # type: ignore[operator] def __format__(self, format_spec: str) -> str: """Format :const:`nvitop.NA` according to ``format_spec``.""" try: return super().__format__(format_spec) except ValueError: return format(math.nan, format_spec) NotApplicableType = NaType # isinstance(NA, str) -> True # NA == 'N/A' -> True # NA is NaType() -> True (`NaType` is a singleton class) NA = NaType() """The singleton instance of :class:`NaType`. The actual value is :const:`str: 'N/A'`.""" NotApplicable = NA """The singleton instance of :class:`NaType`. 
The actual value is :const:`str: 'N/A'`.""" UINT_MAX: int = ctypes.c_uint(-1).value # 0xFFFFFFFF """The maximum value of :class:`ctypes.c_uint`.""" ULONGLONG_MAX: int = ctypes.c_ulonglong(-1).value # 0XFFFFFFFFFFFFFFFF """The maximum value of :class:`ctypes.c_ulonglong`.""" KiB: int = 1 << 10 """Kibibyte (1024)""" MiB: int = 1 << 20 """Mebibyte (1024 * 1024)""" GiB: int = 1 << 30 """Gibibyte (1024 * 1024 * 1024)""" TiB: int = 1 << 40 """Tebibyte (1024 * 1024 * 1024 * 1024)""" PiB: int = 1 << 50 """Pebibyte (1024 * 1024 * 1024 * 1024 * 1024)""" SIZE_UNITS: dict[str | None, int] = { None: 1, '': 1, 'B': 1, 'KiB': KiB, 'MiB': MiB, 'GiB': GiB, 'TiB': TiB, 'PiB': PiB, 'KB': 1000, 'MB': 1000**2, 'GB': 1000**3, 'TB': 1000**4, 'PB': 1000**5, } """Units of storage and memory measurements.""" SIZE_PATTERN: re.Pattern = re.compile( r'^\s*\+?\s*(?P\d+(?:\.\d+)?)\s*(?P[KMGTP]i?B?|B?)\s*$', flags=re.IGNORECASE, ) """The regex pattern for human readable size.""" # pylint: disable-next=too-many-return-statements,too-many-branches def bytes2human( b: int | float | NaType, # noqa: PYI041 *, min_unit: int = 1, ) -> str: """Convert bytes to a human readable string.""" if b == NA: return NA if not isinstance(b, int): try: b = round(float(b)) except ValueError: return NA if b < KiB and min_unit < KiB: return f'{b}B' if b < MiB and min_unit <= KiB: return f'{round(b / KiB)}KiB' if b < 100 * MiB and min_unit <= MiB: return f'{round(b / MiB, 2):.2f}MiB' if b < 1000 * MiB and min_unit <= MiB: return f'{round(b / MiB, 1):.1f}MiB' if b < 20 * GiB and min_unit <= MiB: return f'{round(b / MiB)}MiB' if b < 100 * GiB and min_unit <= GiB: return f'{round(b / GiB, 2):.2f}GiB' if b < 1000 * GiB and min_unit <= GiB: return f'{round(b / GiB, 1):.1f}GiB' if b < 100 * TiB and min_unit <= TiB: return f'{round(b / TiB, 2):.2f}TiB' if b < 1000 * TiB and min_unit <= TiB: return f'{round(b / TiB, 1):.1f}TiB' if b < 100 * PiB: return f'{round(b / PiB, 2):.2f}PiB' return f'{round(b / PiB, 1):.1f}PiB' def 
human2bytes(s: int | str) -> int: """Convert a human readable size string (*case insensitive*) to bytes. Raises: ValueError: If cannot convert the given size string. Examples: >>> human2bytes('500B') 500 >>> human2bytes('10k') 10000 >>> human2bytes('10ki') 10240 >>> human2bytes('1M') 1000000 >>> human2bytes('1MiB') 1048576 >>> human2bytes('1.5GiB') 1610612736 """ if isinstance(s, int): if s >= 0: return s raise ValueError(f'Cannot convert {s!r} to bytes.') match = SIZE_PATTERN.match(s) if match is None: raise ValueError(f'Cannot convert {s!r} to bytes.') size, unit = match.groups() unit = unit.upper().replace('I', 'i').replace('B', '') + 'B' return int(float(size) * SIZE_UNITS[unit]) def timedelta2human( dt: int | float | datetime.timedelta | NaType, # noqa: PYI041 *, round: bool = False, # pylint: disable=redefined-builtin ) -> str: """Convert a number in seconds or a :class:`datetime.timedelta` instance to a human readable string.""" if isinstance(dt, (int, float)): dt = datetime.timedelta(seconds=dt) if not isinstance(dt, datetime.timedelta): return NA if dt.days >= 4 or (round and dt.days >= 1): return f'{dt.days + dt.seconds / 86400:.1f} days' hours, seconds = divmod(86400 * dt.days + dt.seconds, 3600) if hours > 0: return '{:d}:{:02d}:{:02d}'.format(hours, *divmod(seconds, 60)) return '{:d}:{:02d}'.format(*divmod(seconds, 60)) def utilization2string(utilization: int | float | NaType) -> str: # noqa: PYI041 """Convert a utilization rate to string.""" if utilization != NA: if isinstance(utilization, int): return f'{utilization}%' if isinstance(utilization, float): return f'{utilization:.1f}%' return NA def boolify(string: str, default: Any = None) -> bool: """Convert the given value, usually a string, to boolean.""" if string.lower() in {'true', 'yes', 'on', 'enabled', '1'}: return True if string.lower() in {'false', 'no', 'off', 'disabled', '0'}: return False if default is not None: return bool(default) return bool(string) class Snapshot: """A dict-like object 
holds the snapshot values. The value can be accessed by ``snapshot.name`` or ``snapshot['name']`` syntax. The Snapshot can also be converted to a dictionary by ``dict(snapshot)`` or ``{**snapshot}``. Missing attributes will be automatically fetched from the original object. """ def __init__(self, real: Any, **items: Any) -> None: """Initialize a new :class:`Snapshot` object with the given attributes.""" self.real = real self.timestamp = time.time() for key, value in items.items(): setattr(self, key, value) def __repr__(self) -> str: """Return a string representation of the snapshot.""" keys = set(self.__dict__.keys()).difference({'real', 'timestamp'}) keys = ['real', *sorted(keys)] keyvals = [] for key in keys: value = getattr(self, key) keyval = f'{key}={value!r}' if isinstance(value, Snapshot): keyval = keyval.replace('\n', '\n ') # extra indentation for nested snapshots keyvals.append(keyval) return '{}{}(\n {},\n)'.format( self.real.__class__.__name__, self.__class__.__name__, ',\n '.join(keyvals), ) def __hash__(self) -> int: """Return a hash value of the snapshot.""" return hash((self.real, self.timestamp)) def __getattr__(self, name: str) -> Any: """Get a member from the instance. If the attribute is not defined, fetches from the original object and makes a function call. 
""" try: return super().__getattr__(name) # type: ignore[misc] except AttributeError: attribute = getattr(self.real, name) if callable(attribute): attribute = attribute() setattr(self, name, attribute) return attribute def __getitem__(self, name: str) -> Any: """Support ``snapshot['name']`` syntax.""" try: return getattr(self, name) except AttributeError as ex: raise KeyError(name) from ex def __setitem__(self, name: str, value: Any) -> None: """Support ``snapshot['name'] = value`` syntax.""" setattr(self, name, value) def __iter__(self) -> Iterator[str]: """Support ``for name in snapshot`` syntax and ``*`` tuple unpack ``[*snapshot]`` syntax.""" def gen() -> Generator[str]: yield from (name for name in self.__dict__ if name not in {'real', 'timestamp'}) return gen() def keys(self) -> Iterable[str]: # pylint: disable-next=line-too-long """Support ``**`` dictionary unpack ``{**snapshot}`` / ``dict(**snapshot)`` syntax and ``dict(snapshot)`` dictionary conversion.""" return KeysView(self) # type: ignore[arg-type] Method = TypeVar('Method', bound=Callable[..., Any]) # Modified from psutil (https://github.com/giampaolo/psutil) def memoize_when_activated(method: Method) -> Method: """A memoize decorator which is disabled by default. It can be activated and deactivated on request. For efficiency reasons it can be used only against class methods accepting no arguments. 
""" @functools.wraps(method) def wrapped(self: object, *args: Any, **kwargs: Any) -> Any: try: # case 1: we previously entered oneshot() ctx # pylint: disable-next=protected-access ret = self._cache[method] # type: ignore[attr-defined] except AttributeError: # case 2: we never entered oneshot() ctx return method(self, *args, **kwargs) except KeyError: # case 3: we entered oneshot() ctx but there's no cache # for this entry yet ret = method(self, *args, **kwargs) try: # pylint: disable-next=protected-access self._cache[method] = ret # type: ignore[attr-defined] except AttributeError: # multi-threading race condition, see: # https://github.com/giampaolo/psutil/issues/1948 pass return ret def cache_activate(self: object) -> None: """Activate cache. Expects an instance. Cache will be stored as a "_cache" instance attribute. """ if not hasattr(self, '_cache'): # pylint: disable-next=protected-access self._cache = {} # type: ignore[attr-defined] def cache_deactivate(self: object) -> None: """Deactivate and clear cache.""" try: # pylint: disable-next=protected-access del self._cache # type: ignore[attr-defined] except AttributeError: pass wrapped.cache_activate = cache_activate # type: ignore[attr-defined] wrapped.cache_deactivate = cache_deactivate # type: ignore[attr-defined] return wrapped # type: ignore[return-value] nvitop-1.4.2/nvitop/callbacks/000077500000000000000000000000001474547113600163075ustar00rootroot00000000000000nvitop-1.4.2/nvitop/callbacks/LICENSE000066400000000000000000000261501474547113600173200ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2021-2025 Xuehai Pan. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. nvitop-1.4.2/nvitop/callbacks/__init__.py000066400000000000000000000013661474547113600204260ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== nvitop-1.4.2/nvitop/callbacks/keras.py000066400000000000000000000167211474547113600177750ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== # pylint: disable=missing-module-docstring,missing-function-docstring # pylint: disable=unused-argument,attribute-defined-outside-init from __future__ import annotations import re import time # pylint: disable-next=import-error,no-name-in-module from tensorflow.python.keras.callbacks import Callback from nvitop.api import libnvml from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats # Ported version of nvitop.callbacks.lightning.GpuStatsLogger for Keras class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes """Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and in order to use it you need to assign a TensorBoard callback or a CSVLogger callback to the model. Args: memory_utilization (bool): Set to :data:`True` to log used, free and the percentage of memory utilization at the start and end of each step. Default: :data:`True`. gpu_utilization (bool): Set to :data:`True` to log the percentage of GPU utilization at the start and end of each step. 
Default: :data:`True`. intra_step_time (bool): Set to :data:`True` to log the time of each step. Default: :data:`False`. inter_step_time (bool): Set to :data:`True` to log the time between the end of one step and the start of the next step. Default: :data:`False`. fan_speed (bool): Set to :data:`True` to log percentage of fan speed. Default: :data:`False`. temperature (bool): Set to :data:`True` to log the gpu temperature in degree Celsius. Default: :data:`False`. Raises: ValueError: If NVIDIA driver is not installed, or the `gpus` argument does not match available devices. Examples: >>> from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model >>> from tensorflow.python.keras.callbacks import TensorBoard >>> from nvitop.callbacks.keras import GpuStatsLogger >>> gpus = ['/gpu:0', '/gpu:1'] # or gpus = [0, 1] or gpus = 2 >>> model = Xception(weights=None, ..) >>> model = multi_gpu_model(model, gpus) >>> model.compile(..) >>> tb_callback = TensorBoard(log_dir='./logs') >>> gpu_stats = GpuStatsLogger(gpus) >>> model.fit(.., callbacks=[gpu_stats, tb_callback]) Note:: The GpuStatsLogger callback should be placed before the TensorBoard / CSVLogger callback. GPU stats are mainly based on NVML queries. The description of the queries is as follows: - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is currently intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure. - **memory.used** - Total memory allocated by active contexts, in MiBs. - **memory.free** - Total free memory, in MiBs. - **utilization.gpu** - Percent of time over the past sample period during which one or more kernels was executing on the GPU. 
The sample period may be between 1 second and 1/6 second depending on the product. - **utilization.memory** - Percent of time over the past sample period during which global (device) memory was being read or written. The sample period may be between 1 second and 1/6 second depending on the product. - **temperature** - Core GPU temperature, in degrees C. """ GPU_NAME_PATTERN = re.compile(r'^/(\w*device:)?GPU:(?P\d+)$', flags=re.IGNORECASE) def __init__( # pylint: disable=too-many-arguments self, gpus: int | list[int | str] | tuple[int | str, ...], *, memory_utilization: bool = True, gpu_utilization: bool = True, intra_step_time: bool = False, inter_step_time: bool = False, fan_speed: bool = False, temperature: bool = False, ) -> None: super().__init__() try: libnvml.nvmlInit() except libnvml.NVMLError as ex: raise ValueError( 'Cannot use the GpuStatsLogger callback because the NVIDIA driver is not installed.', ) from ex if isinstance(gpus, (list, tuple)): gpus = list(gpus) for i, gpu_id in enumerate(gpus): if isinstance(gpu_id, str) and self.GPU_NAME_PATTERN.match(gpu_id): gpus[i] = self.GPU_NAME_PATTERN.match(gpu_id).group('ID') gpu_ids = sorted(set(map(int, gpus))) else: gpu_ids = list(range(gpus)) try: self._devices = get_devices_by_logical_ids(gpu_ids, unique=True) except (libnvml.NVMLError, RuntimeError) as ex: raise ValueError( f'Cannot use GpuStatsLogger callback because devices unavailable. 
' f'Received: `gpus={gpu_ids}`', ) from ex self._memory_utilization = memory_utilization self._gpu_utilization = gpu_utilization self._intra_step_time = intra_step_time self._inter_step_time = inter_step_time self._fan_speed = fan_speed self._temperature = temperature def on_train_epoch_start(self, epoch, logs=None) -> None: self._snap_intra_step_time = None self._snap_inter_step_time = None def on_train_batch_start(self, batch, logs=None) -> None: logs = logs or {} if self._intra_step_time: self._snap_intra_step_time = time.monotonic() logs.update(self._get_gpu_stats()) if self._inter_step_time and self._snap_inter_step_time: # First log at beginning of second step logs['batch_time/inter_step (ms)'] = 1000.0 * ( time.monotonic() - self._snap_inter_step_time ) def on_train_batch_end(self, batch, logs=None) -> None: logs = logs or {} if self._inter_step_time: self._snap_inter_step_time = time.monotonic() logs.update(self._get_gpu_stats()) if self._intra_step_time and self._snap_intra_step_time: logs['batch_time/intra_step (ms)'] = 1000.0 * ( time.monotonic() - self._snap_intra_step_time ) def _get_gpu_stats(self) -> dict[str, float]: """Get the gpu status from NVML queries.""" return get_gpu_stats( devices=self._devices, memory_utilization=self._memory_utilization, gpu_utilization=self._gpu_utilization, fan_speed=self._fan_speed, temperature=self._temperature, ) nvitop-1.4.2/nvitop/callbacks/lightning.py000066400000000000000000000172531474547113600206540ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== # pylint: disable=missing-module-docstring,missing-function-docstring # pylint: disable=unused-argument,attribute-defined-outside-init from __future__ import annotations import time from typing import TYPE_CHECKING, Any from lightning.pytorch.callbacks import Callback # pylint: disable=import-error from lightning.pytorch.utilities import rank_zero_only # pylint: disable=import-error from lightning.pytorch.utilities.exceptions import ( # pylint: disable=import-error MisconfigurationException, ) from nvitop.api import libnvml from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats if TYPE_CHECKING: import lightning.pytorch as pl # Modified from pytorch_lightning.callbacks.GPUStatsMonitor class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes """Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and in order to use it you need to assign a logger in the ``Trainer``. Args: memory_utilization (bool): Set to :data:`True` to log used, free and the percentage of memory utilization at the start and end of each step. Default: :data:`True`. gpu_utilization (bool): Set to :data:`True` to log the percentage of GPU utilization at the start and end of each step. Default: :data:`True`. intra_step_time (bool): Set to :data:`True` to log the time of each step. Default: :data:`False`. inter_step_time (bool): Set to :data:`True` to log the time between the end of one step and the start of the next step. 
Default: :data:`False`. fan_speed (bool): Set to :data:`True` to log percentage of fan speed. Default: :data:`False`. temperature (bool): Set to :data:`True` to log the gpu temperature in degree Celsius. Default: :data:`False`. Raises: MisconfigurationException: If NVIDIA driver is not installed, not running on GPUs, or ``Trainer`` has no logger. Examples: >>> from lightning.pytorch import Trainer >>> from nvitop.callbacks.lightning import GpuStatsLogger >>> gpu_stats = GpuStatsLogger() >>> trainer = Trainer(gpus=[..], logger=True, callbacks=[gpu_stats]) GPU stats are mainly based on NVML queries. The description of the queries is as follows: - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is currently intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure. - **memory.used** - Total memory allocated by active contexts, in MiBs. - **memory.free** - Total free memory, in MiBs. - **utilization.gpu** - Percent of time over the past sample period during which one or more kernels was executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product. - **utilization.memory** - Percent of time over the past sample period during which global (device) memory was being read or written. The sample period may be between 1 second and 1/6 second depending on the product. - **temperature** - Core GPU temperature, in degrees C. 
""" def __init__( # pylint: disable=too-many-arguments self, *, memory_utilization: bool = True, gpu_utilization: bool = True, intra_step_time: bool = False, inter_step_time: bool = False, fan_speed: bool = False, temperature: bool = False, ) -> None: super().__init__() try: libnvml.nvmlInit() except libnvml.NVMLError as ex: raise MisconfigurationException( 'Cannot use GpuStatsLogger callback because NVIDIA driver is not installed.', ) from ex self._memory_utilization = memory_utilization self._gpu_utilization = gpu_utilization self._intra_step_time = intra_step_time self._inter_step_time = inter_step_time self._fan_speed = fan_speed self._temperature = temperature def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: if not trainer.logger: raise MisconfigurationException( 'Cannot use GpuStatsLogger callback with Trainer that has no logger.', ) if trainer.strategy.root_device.type != 'cuda': raise MisconfigurationException( f'You are using GpuStatsLogger but are not running on GPU. ' f'The root device type is {trainer.strategy.root_device.type}.', ) device_ids = trainer.device_ids try: self._devices = get_devices_by_logical_ids(device_ids, unique=True) except (libnvml.NVMLError, RuntimeError) as ex: raise ValueError( f'Cannot use GpuStatsLogger callback because devices unavailable. 
' f'Received: `gpus={device_ids}`', ) from ex def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: self._snap_intra_step_time = None self._snap_inter_step_time = None @rank_zero_only def on_train_batch_start( # pylint: disable=arguments-differ self, trainer: pl.Trainer, pl_module: pl.LightningModule, **kwargs: Any, ) -> None: if self._intra_step_time: self._snap_intra_step_time = time.monotonic() logs = self._get_gpu_stats() if self._inter_step_time and self._snap_inter_step_time: # First log at beginning of second step logs['batch_time/inter_step (ms)'] = 1000.0 * ( time.monotonic() - self._snap_inter_step_time ) trainer.logger.log_metrics(logs, step=trainer.global_step) @rank_zero_only def on_train_batch_end( # pylint: disable=arguments-differ self, trainer: pl.Trainer, pl_module: pl.LightningModule, **kwargs: Any, ) -> None: if self._inter_step_time: self._snap_inter_step_time = time.monotonic() logs = self._get_gpu_stats() if self._intra_step_time and self._snap_intra_step_time: logs['batch_time/intra_step (ms)'] = 1000.0 * ( time.monotonic() - self._snap_intra_step_time ) trainer.logger.log_metrics(logs, step=trainer.global_step) def _get_gpu_stats(self) -> dict[str, float]: """Get the gpu status from NVML queries.""" return get_gpu_stats( devices=self._devices, memory_utilization=self._memory_utilization, gpu_utilization=self._gpu_utilization, fan_speed=self._fan_speed, temperature=self._temperature, ) nvitop-1.4.2/nvitop/callbacks/pytorch_lightning.py000066400000000000000000000175301474547113600224220ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== # pylint: disable=missing-module-docstring,missing-function-docstring # pylint: disable=unused-argument,attribute-defined-outside-init from __future__ import annotations import time from typing import TYPE_CHECKING, Any from pytorch_lightning.callbacks import Callback # pylint: disable=import-error from pytorch_lightning.utilities import rank_zero_only # pylint: disable=import-error from pytorch_lightning.utilities.exceptions import ( # pylint: disable=import-error MisconfigurationException, ) from nvitop.api import libnvml from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats if TYPE_CHECKING: import pytorch_lightning as pl # Modified from pytorch_lightning.callbacks.GPUStatsMonitor class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes """Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and in order to use it you need to assign a logger in the ``Trainer``. Args: memory_utilization (bool): Set to :data:`True` to log used, free and the percentage of memory utilization at the start and end of each step. Default: :data:`True`. gpu_utilization (bool): Set to :data:`True` to log the percentage of GPU utilization at the start and end of each step. Default: :data:`True`. intra_step_time (bool): Set to :data:`True` to log the time of each step. Default: :data:`False`. inter_step_time (bool): Set to :data:`True` to log the time between the end of one step and the start of the next step. 
Default: :data:`False`. fan_speed (bool): Set to :data:`True` to log percentage of fan speed. Default: :data:`False`. temperature (bool): Set to :data:`True` to log the gpu temperature in degree Celsius. Default: :data:`False`. Raises: MisconfigurationException: If NVIDIA driver is not installed, not running on GPUs, or ``Trainer`` has no logger. Examples: >>> from pytorch_lightning import Trainer >>> from nvitop.callbacks.pytorch_lightning import GpuStatsLogger >>> gpu_stats = GpuStatsLogger() >>> trainer = Trainer(gpus=[..], logger=True, callbacks=[gpu_stats]) GPU stats are mainly based on NVML queries. The description of the queries is as follows: - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is currently intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure. - **memory.used** - Total memory allocated by active contexts, in MiBs. - **memory.free** - Total free memory, in MiBs. - **utilization.gpu** - Percent of time over the past sample period during which one or more kernels was executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product. - **utilization.memory** - Percent of time over the past sample period during which global (device) memory was being read or written. The sample period may be between 1 second and 1/6 second depending on the product. - **temperature** - Core GPU temperature, in degrees C. 
""" def __init__( # pylint: disable=too-many-arguments self, *, memory_utilization: bool = True, gpu_utilization: bool = True, intra_step_time: bool = False, inter_step_time: bool = False, fan_speed: bool = False, temperature: bool = False, ) -> None: super().__init__() try: libnvml.nvmlInit() except libnvml.NVMLError as ex: raise MisconfigurationException( 'Cannot use GpuStatsLogger callback because NVIDIA driver is not installed.', ) from ex self._memory_utilization = memory_utilization self._gpu_utilization = gpu_utilization self._intra_step_time = intra_step_time self._inter_step_time = inter_step_time self._fan_speed = fan_speed self._temperature = temperature def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: if not trainer.logger: raise MisconfigurationException( 'Cannot use GpuStatsLogger callback with Trainer that has no logger.', ) if trainer.strategy.root_device.type != 'cuda': raise MisconfigurationException( f'You are using GpuStatsLogger but are not running on GPU. ' f'The root device type is {trainer.strategy.root_device.type}.', ) try: device_ids = trainer.device_ids # pytorch-lightning >= 1.6.0 except AttributeError: device_ids = trainer.data_parallel_device_ids # pytorch-lightning < 1.6.0 try: self._devices = get_devices_by_logical_ids(device_ids, unique=True) except (libnvml.NVMLError, RuntimeError) as ex: raise ValueError( f'Cannot use GpuStatsLogger callback because devices unavailable. 
' f'Received: `gpus={device_ids}`', ) from ex def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: self._snap_intra_step_time = None self._snap_inter_step_time = None @rank_zero_only def on_train_batch_start( # pylint: disable=arguments-differ self, trainer: pl.Trainer, pl_module: pl.LightningModule, **kwargs: Any, ) -> None: if self._intra_step_time: self._snap_intra_step_time = time.monotonic() logs = self._get_gpu_stats() if self._inter_step_time and self._snap_inter_step_time: # First log at beginning of second step logs['batch_time/inter_step (ms)'] = 1000.0 * ( time.monotonic() - self._snap_inter_step_time ) trainer.logger.log_metrics(logs, step=trainer.global_step) @rank_zero_only def on_train_batch_end( # pylint: disable=arguments-differ self, trainer: pl.Trainer, pl_module: pl.LightningModule, **kwargs: Any, ) -> None: if self._inter_step_time: self._snap_inter_step_time = time.monotonic() logs = self._get_gpu_stats() if self._intra_step_time and self._snap_intra_step_time: logs['batch_time/intra_step (ms)'] = 1000.0 * ( time.monotonic() - self._snap_intra_step_time ) trainer.logger.log_metrics(logs, step=trainer.global_step) def _get_gpu_stats(self) -> dict[str, float]: """Get the gpu status from NVML queries.""" return get_gpu_stats( devices=self._devices, memory_utilization=self._memory_utilization, gpu_utilization=self._gpu_utilization, fan_speed=self._fan_speed, temperature=self._temperature, ) nvitop-1.4.2/nvitop/callbacks/tensorboard.py000066400000000000000000000030701474547113600212030ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== # pylint: disable=missing-module-docstring from __future__ import annotations from typing import TYPE_CHECKING if TYPE_CHECKING: import numpy as np try: from tensorboard.summary import Writer as SummaryWriter except ImportError: try: from tensorboardX import SummaryWriter except ImportError: pass def add_scalar_dict( writer: SummaryWriter, main_tag: str, tag_scalar_dict: dict[str, int | float | np.floating], global_step: int | np.integer | None = None, walltime: float | None = None, ) -> None: """Add a batch of scalars to the writer. Batched version of ``writer.add_scalar``. """ for tag, scalar in tag_scalar_dict.items(): writer.add_scalar(f'{main_tag}/{tag}', scalar, global_step=global_step, walltime=walltime) nvitop-1.4.2/nvitop/callbacks/utils.py000066400000000000000000000050541474547113600200250ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== # pylint: disable=missing-module-docstring,missing-function-docstring from __future__ import annotations from nvitop.api import CudaDevice, Device, MiB def get_devices_by_logical_ids(device_ids: list[int], unique: bool = True) -> list[CudaDevice]: cuda_devices = CudaDevice.from_indices(device_ids) devices = [] presented = set() for device in cuda_devices: if device.cuda_index in presented and unique: continue devices.append(device) presented.add(device.cuda_index) return devices def get_gpu_stats( devices: list[Device], memory_utilization: bool = True, gpu_utilization: bool = True, fan_speed: bool = False, temperature: bool = False, ) -> dict[str, float]: """Get the GPU status from NVML queries.""" stats = {} for device in devices: prefix = f'gpu_id: {device.cuda_index}' if device.cuda_index != device.physical_index: prefix += f' (physical index: {device.physical_index})' with device.oneshot(): if memory_utilization or gpu_utilization: utilization = device.utilization_rates() if memory_utilization: stats[f'{prefix}/utilization.memory (%)'] = float(utilization.memory) if gpu_utilization: stats[f'{prefix}/utilization.gpu (%)'] = float(utilization.gpu) if memory_utilization: stats[f'{prefix}/memory.used (MiB)'] = float(device.memory_used()) / MiB stats[f'{prefix}/memory.free (MiB)'] = float(device.memory_free()) / MiB if fan_speed: stats[f'{prefix}/fan.speed (%)'] = float(device.fan_speed()) if temperature: stats[f'{prefix}/temperature.gpu (C)'] = float(device.fan_speed()) return stats nvitop-1.4.2/nvitop/cli.py000066400000000000000000000343431474547113600155200ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. 
"""The interactive NVIDIA-GPU process viewer.""" import argparse import curses import os import sys import textwrap from nvitop.api import HostProcess, libnvml from nvitop.tui import TUI, USERNAME, Device, colored, libcurses, set_color, setlocale_utf8 from nvitop.version import __version__ TTY = sys.stdin.isatty() and sys.stdout.isatty() NVITOP_MONITOR_MODE = set( map( str.strip, os.environ.get('NVITOP_MONITOR_MODE', '').lower().split(','), ), ) # pylint: disable=too-many-branches,too-many-statements def parse_arguments() -> argparse.Namespace: """Parse command-line arguments for ``nvitop``.""" coloring_rules = '{} < th1 %% <= {} < th2 %% <= {}'.format( colored('light', 'green'), colored('moderate', 'yellow'), colored('heavy', 'red'), ) def posfloat(argstring: str) -> float: num = float(argstring) if num <= 0: raise ValueError return num posfloat.__name__ = 'positive float' parser = argparse.ArgumentParser( prog='nvitop', description='An interactive NVIDIA-GPU process viewer.', formatter_class=argparse.RawTextHelpFormatter, add_help=False, ) parser.add_argument( '--help', '-h', dest='help', action='help', default=argparse.SUPPRESS, help='Show this help message and exit.', ) parser.add_argument( '--version', '-V', dest='version', action='version', version=f'%(prog)s {__version__}', help="Show %(prog)s's version number and exit.", ) mode = parser.add_mutually_exclusive_group() mode.add_argument( '--once', '-1', dest='once', action='store_true', help='Report query data only once.', ) mode.add_argument( '--monitor', '-m', dest='monitor', type=str, default=argparse.SUPPRESS, nargs='?', choices=['auto', 'full', 'compact'], help=( 'Run as a resource monitor. 
Continuously report query data and handle user inputs.\n' 'If the argument is omitted, the value from `NVITOP_MONITOR_MODE` will be used.\n' '(default fallback mode: auto)' ), ) parser.add_argument( '--interval', dest='interval', type=posfloat, default=None, metavar='SEC', help='Process status update interval in seconds. (default: 2)', ) parser.add_argument( '--ascii', '--no-unicode', '-U', dest='ascii', action='store_true', help='Use ASCII characters only, which is useful for terminals without Unicode support.', ) coloring = parser.add_argument_group('coloring') coloring.add_argument( '--colorful', dest='colorful', action='store_true', help=( 'Use gradient colors to get spectrum-like bar charts. This option is only available\n' 'when the terminal supports 256 colors. You may need to set environment variable\n' '`TERM="xterm-256color"`. Note that the terminal multiplexer, such as `tmux`, may\n' 'override the `TREM` variable.' ), ) coloring.add_argument( '--force-color', dest='force_color', action='store_true', help='Force colorize even when `stdout` is not a TTY terminal.', ) coloring.add_argument( '--light', action='store_true', help=( 'Tweak visual results for light theme terminals in monitor mode.\n' 'Set variable `NVITOP_MONITOR_MODE="light"` on light terminals for convenience.' 
), ) gpu_thresholds = Device.GPU_UTILIZATION_THRESHOLDS coloring.add_argument( '--gpu-util-thresh', type=int, nargs=2, choices=range(1, 100), metavar=('th1', 'th2'), help=( 'Thresholds of GPU utilization to determine the load intensity.\n' 'Coloring rules: {}.\n' '( 1 <= th1 < th2 <= 99, defaults: {} {} )' ).format(coloring_rules, *gpu_thresholds), ) memory_thresholds = Device.MEMORY_UTILIZATION_THRESHOLDS coloring.add_argument( '--mem-util-thresh', type=int, nargs=2, choices=range(1, 100), metavar=('th1', 'th2'), help=( 'Thresholds of GPU memory percent to determine the load intensity.\n' 'Coloring rules: {}.\n' '( 1 <= th1 < th2 <= 99, defaults: {} {} )' ).format(coloring_rules, *memory_thresholds), ) device_filtering = parser.add_argument_group('device filtering') device_filtering.add_argument( '--only', '-o', dest='only', type=int, nargs='+', metavar='INDEX', help='Only show the specified devices, suppress option `--only-visible`.', ) device_filtering.add_argument( '--only-visible', '-ov', dest='only_visible', action='store_true', help='Only show devices in the `CUDA_VISIBLE_DEVICES` environment variable.', ) process_filtering = parser.add_argument_group('process filtering') process_filtering.add_argument( '--compute', '-c', dest='compute', action='store_true', help="Only show GPU processes with the compute context. (type: 'C' or 'C+G')", ) process_filtering.add_argument( '--only-compute', '-C', dest='only_compute', action='store_true', help="Only show GPU processes exactly with the compute context. (type: 'C' only)", ) process_filtering.add_argument( '--graphics', '-g', dest='graphics', action='store_true', help="Only show GPU processes with the graphics context. (type: 'G' or 'C+G')", ) process_filtering.add_argument( '--only-graphics', '-G', dest='only_graphics', action='store_true', help="Only show GPU processes exactly with the graphics context. 
(type: 'G' only)", ) process_filtering.add_argument( '--user', '-u', dest='user', type=str, nargs='*', metavar='USERNAME', help='Only show processes of the given users (or `$USER` for no argument).', ) process_filtering.add_argument( '--pid', '-p', dest='pid', type=int, nargs='+', metavar='PID', help='Only show processes of the given PIDs.', ) args = parser.parse_args() if args.interval is not None and args.interval < 0.25: parser.error( f'the interval {args.interval:0.2g}s is too short, which may cause performance issues. ' f'Expected 1/4 or higher.', ) if not args.colorful: args.colorful = 'colorful' in NVITOP_MONITOR_MODE and 'plain' not in NVITOP_MONITOR_MODE if not args.light: args.light = 'light' in NVITOP_MONITOR_MODE and 'dark' not in NVITOP_MONITOR_MODE if args.user is not None and len(args.user) == 0: args.user.append(USERNAME) if args.gpu_util_thresh is None: try: gpu_util_thresh = list( map(int, os.getenv('NVITOP_GPU_UTILIZATION_THRESHOLDS', '').split(',')), )[:2] except ValueError: pass else: if ( len(gpu_util_thresh) == 2 and min(gpu_util_thresh) > 0 and max(gpu_util_thresh) < 100 ): args.gpu_util_thresh = gpu_util_thresh if args.mem_util_thresh is None: try: mem_util_thresh = list( map(int, os.getenv('NVITOP_MEMORY_UTILIZATION_THRESHOLDS', '').split(',')), )[:2] except ValueError: pass else: if ( len(mem_util_thresh) == 2 and min(mem_util_thresh) > 0 and max(mem_util_thresh) < 100 ): args.mem_util_thresh = mem_util_thresh return args # pylint: disable-next=too-many-branches,too-many-statements,too-many-locals def main() -> int: """Main function for ``nvitop`` CLI.""" args = parse_arguments() if args.force_color: set_color(True) messages = [] if args.once and hasattr(args, 'monitor'): messages.append('ERROR: Both `--once` and `--monitor` switches are on.') del args.monitor if not args.once and not hasattr(args, 'monitor') and TTY: args.monitor = None if hasattr(args, 'monitor') and not TTY: messages.append('ERROR: You must run monitor mode from a TTY 
terminal.') del args.monitor if hasattr(args, 'monitor') and args.monitor is None: mode = NVITOP_MONITOR_MODE.intersection({'auto', 'full', 'compact'}) mode = 'auto' if len(mode) != 1 else mode.pop() args.monitor = mode if not setlocale_utf8(): args.ascii = True try: device_count = Device.count() except libnvml.NVMLError_LibraryNotFound: return 1 except libnvml.NVMLError as ex: print( '{} {}'.format(colored('NVML ERROR:', color='red', attrs=('bold',)), ex), file=sys.stderr, ) return 1 if args.gpu_util_thresh is not None: Device.GPU_UTILIZATION_THRESHOLDS = tuple(sorted(args.gpu_util_thresh)) if args.mem_util_thresh is not None: Device.MEMORY_UTILIZATION_THRESHOLDS = tuple(sorted(args.mem_util_thresh)) if args.only is not None: indices = set(args.only) invalid_indices = indices.difference(range(device_count)) indices.intersection_update(range(device_count)) if len(invalid_indices) > 1: messages.append(f'ERROR: Invalid device indices: {sorted(invalid_indices)}.') elif len(invalid_indices) == 1: messages.append(f'ERROR: Invalid device index: {next(iter(invalid_indices))}.') elif args.only_visible: indices = { index if isinstance(index, int) else index[0] for index in Device.parse_cuda_visible_devices() } else: indices = set(range(device_count)) devices = Device.from_indices(sorted(indices)) filters = [] if args.compute: filters.append(lambda process: 'C' in process.type or 'X' in process.type) if args.only_compute: filters.append(lambda process: 'G' not in process.type and 'X' not in process.type) if args.graphics: filters.append(lambda process: 'G' in process.type or 'X' in process.type) if args.only_graphics: filters.append(lambda process: 'C' not in process.type and 'X' not in process.type) if args.user is not None: users = set(args.user) filters.append(lambda process: process.username in users) if args.pid is not None: pids = set(args.pid) filters.append(lambda process: process.pid in pids) tui = None if hasattr(args, 'monitor') and len(devices) > 0: try: with 
libcurses(colorful=args.colorful, light_theme=args.light) as win: tui = TUI( devices, filters, ascii=args.ascii, mode=args.monitor, interval=args.interval, win=win, ) tui.loop() except curses.error as ex: if tui is not None: raise messages.append(f'ERROR: Failed to initialize `curses` ({ex})') if tui is None: tui = TUI(devices, filters, ascii=args.ascii) if not sys.stdout.isatty(): parent = HostProcess().parent() if parent is not None: grandparent = parent.parent() if ( grandparent is not None and parent.name() == 'sh' and grandparent.name() == 'watch' ): messages.append( 'HINT: You are running `nvitop` under `watch` command. ' 'Please try `nvitop -m` directly.', ) tui.print() tui.destroy() if len(libnvml.UNKNOWN_FUNCTIONS) > 0: unknown_function_messages = [ ( 'ERROR: Some FunctionNotFound errors occurred while calling:' if len(libnvml.UNKNOWN_FUNCTIONS) > 1 else 'ERROR: A FunctionNotFound error occurred while calling:' ), ] unknown_function_messages.extend( f' nvmlQuery({(func.__name__ if not isinstance(func, str) else func)!r}, *args, **kwargs)' for func, _ in libnvml.UNKNOWN_FUNCTIONS.values() ) unknown_function_messages.append( ( 'Please verify whether the `nvidia-ml-py` package is compatible with your NVIDIA driver version.\n' 'You can check the release history of `nvidia-ml-py` and install the compatible version manually.\n' 'See {} for more information.' ).format( colored('https://github.com/XuehaiPan/nvitop#installation', attrs=('underline',)), ), ) if libnvml._pynvml_installation_corrupted: # pylint: disable=protected-access message = textwrap.dedent( """ WARNING: The `nvidia-ml-py` package is corrupted. 
Please reinstall it using: pip3 install --force-reinstall nvitop nvidia-ml-py or install `nvitop` in an isolated environment: pip3 install --upgrade pipx pipx run nvitop """, ) messages.append(message.strip() + '\n') if len(messages) > 0: for message in messages: for prefix, color in (('ERROR:', 'red'), ('WARNING:', 'yellow'), ('HINT:', 'green')): if message.startswith(prefix): message = message.replace( prefix, colored(prefix, color=color, attrs=('bold',)), # type: ignore[arg-type] 1, ) break print(message, file=sys.stderr) return 1 return 0 if __name__ == '__main__': sys.exit(main()) nvitop-1.4.2/nvitop/select.py000066400000000000000000000503461474547113600162310ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """CUDA visible devices selection tool. Command line usage: .. 
code-block:: bash

    # All devices but sorted
    nvisel  # or use `python3 -m nvitop.select`

    # A simple example to select 4 devices
    nvisel -n 4  # or use `python3 -m nvitop.select -n 4`

    # Select available devices that satisfy the given constraints
    nvisel --min-count 2 --max-count 3 --min-free-memory 5GiB --max-gpu-utilization 60

    # Set `CUDA_VISIBLE_DEVICES` environment variable using `nvisel`
    export CUDA_DEVICE_ORDER="PCI_BUS_ID" CUDA_VISIBLE_DEVICES="$(nvisel -c 1 -f 10GiB)"

    # Use UUID strings in `CUDA_VISIBLE_DEVICES` environment variable
    export CUDA_VISIBLE_DEVICES="$(nvisel -O uuid -c 2 -f 5000M)"

    # Pipe output to other shell utilities
    nvisel -0 -O uuid -c 2 -f 4GiB | xargs -0 -I {} nvidia-smi --id={} --query-gpu=index,memory.free --format=csv

    # Normalize the `CUDA_VISIBLE_DEVICES` environment variable (e.g. convert UUIDs to indices or get full UUIDs for an abbreviated form)
    nvisel -i -S

Python API:

.. code-block:: python

    # Put this at the top of the Python script
    import os
    from nvitop import select_devices

    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
        select_devices(format='uuid', min_count=4, min_free_memory='8GiB')
    )
"""

# pylint: disable=line-too-long

from __future__ import annotations

import argparse
import contextlib
import math
import os
import sys
import warnings
from typing import TYPE_CHECKING, overload

from nvitop.api import Device, GpuProcess, Snapshot, colored, host, human2bytes, libnvml
from nvitop.version import __version__


if TYPE_CHECKING:
    from collections.abc import Callable, Iterable, Sequence

    from typing_extensions import Literal  # Python 3.8+


__all__ = ['select_devices']


# Overload: `format='index'` yields plain indices for non-MIG devices, or
# (physical index, MIG index) pairs for MIG devices.
@overload
def select_devices(  # pylint: disable=too-many-arguments
    devices: Iterable[Device] | None,
    *,
    format: Literal['index'],  # pylint: disable=redefined-builtin
    force_index: bool,
    min_count: int,
    max_count: int | None,
    min_free_memory: int | str | None,
    min_total_memory: int | str | None,
    max_gpu_utilization: int | None,
    max_memory_utilization: int | None,
    tolerance: int,
    free_accounts: list[str] | None,
    sort: bool,
) -> list[int] | list[tuple[int, int]]: ...


# Overload: `format='uuid'` yields UUID strings (the implementation returns
# `device.uuid` values, so the result is a list of strings).
@overload
def select_devices(  # pylint: disable=too-many-arguments
    devices: Iterable[Device] | None,
    *,
    format: Literal['uuid'],  # pylint: disable=redefined-builtin
    force_index: bool,
    min_count: int,
    max_count: int | None,
    min_free_memory: int | str | None,
    min_total_memory: int | str | None,
    max_gpu_utilization: int | None,
    max_memory_utilization: int | None,
    tolerance: int,
    free_accounts: list[str] | None,
    sort: bool,
) -> list[str]: ...


# Overload: `format='device'` yields the live `Device` objects themselves.
@overload
def select_devices(  # pylint: disable=too-many-arguments
    devices: Iterable[Device] | None,
    *,
    format: Literal['device'],  # pylint: disable=redefined-builtin
    force_index: bool,
    min_count: int,
    max_count: int | None,
    min_free_memory: int | str | None,
    min_total_memory: int | str | None,
    max_gpu_utilization: int | None,
    max_memory_utilization: int | None,
    tolerance: int,
    free_accounts: list[str] | None,
    sort: bool,
) -> list[Device]: ...


# pylint: disable-next=too-many-branches,too-many-statements,too-many-locals,too-many-arguments
def select_devices(
    devices: Iterable[Device] | None = None,
    *,
    format: Literal['index', 'uuid', 'device'] = 'index',  # pylint: disable=redefined-builtin
    force_index: bool = False,
    min_count: int = 0,
    max_count: int | None = None,
    min_free_memory: int | str | None = None,  # in bytes or human readable
    min_total_memory: int | str | None = None,  # in bytes or human readable
    max_gpu_utilization: int | None = None,  # in percentage
    max_memory_utilization: int | None = None,  # in percentage
    tolerance: int = 0,  # in percentage
    free_accounts: list[str] | None = None,
    sort: bool = True,
) -> list[int] | list[tuple[int, int]] | list[str] | list[Device]:
    """Select a subset of devices satisfying the specified criteria.

    Note:
        The *min count* constraint may not be satisfied if not enough devices are
        available. This constraint is only enforced when there are both MIG and
        non-MIG devices present.

    Examples:
        Put the following lines to the top of your script:

        .. code-block:: python

            import os
            from nvitop import select_devices

            os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
                select_devices(format='uuid', min_count=4, min_free_memory='8GiB')
            )

    Args:
        devices (Iterable[Device]):
            The device superset to select from. If not specified, use all devices as the superset.
        format (str):
            The format of the output. One of :const:`'index'`, :const:`'uuid'`, or :const:`'device'`.
            If gets any MIG device with format :const:`'index'` set, falls back to the
            :const:`'uuid'` format.
        force_index (bool):
            If :data:`True`, always use the device index as the output format when gets any MIG device.
        min_count (int):
            The minimum number of devices to select.
        max_count (Optional[int]):
            The maximum number of devices to select.
        min_free_memory (Optional[Union[int, str]]):
            The minimum free memory (an :class:`int` *in bytes* or a :class:`str` in human readable
            form) of the selected devices.
        min_total_memory (Optional[Union[int, str]]):
            The minimum total memory (an :class:`int` *in bytes* or a :class:`str` in human readable
            form) of the selected devices.
        max_gpu_utilization (Optional[int]):
            The maximum GPU utilization rate (*in percentage*) of the selected devices.
        max_memory_utilization (Optional[int]):
            The maximum memory bandwidth utilization rate (*in percentage*) of the selected devices.
        tolerance (int):
            The tolerance rate (*in percentage*) to loose the constraints.
        free_accounts (List[str]):
            A list of accounts whose used GPU memory needs be considered as free memory.
        sort (bool):
            If :data:`True`, sort the selected devices by memory usage and GPU utilization.

    Returns:
        A list of the device identifiers.
    """
    assert format in {'index', 'uuid', 'device'}
    assert tolerance >= 0
    tolerance = tolerance / 100.0  # convert percentage to a fraction
    if max_count is not None:
        if max_count == 0:
            return []
        assert max_count >= min_count >= 0

    free_accounts = set(free_accounts or [])

    if devices is None:
        devices = Device.all()

    # Accept human-readable sizes (e.g. '8GiB') for the memory constraints.
    if isinstance(min_free_memory, str):
        min_free_memory = human2bytes(min_free_memory)
    if isinstance(min_total_memory, str):
        min_total_memory = human2bytes(min_total_memory)

    # Work on immutable snapshots of the leaf devices (MIG instances when
    # MIG is enabled on a device, otherwise the device itself).
    available_devices: list[Snapshot] = []
    for device in devices:
        available_devices.extend(dev.as_snapshot() for dev in device.to_leaf_devices())
    # `loosen_constraints` counts how many strict criteria each device fails
    # (only accepted via the tolerance); used as the primary sort key below.
    for device in available_devices:
        device.loosen_constraints = 0  # type: ignore[attr-defined]

    # Re-account the memory held by the given users as free memory.
    if len(free_accounts) > 0:
        with GpuProcess.failsafe():
            for device in available_devices:
                as_free_memory = 0
                for process in device.real.processes().values():
                    if process.username() in free_accounts:
                        as_free_memory += process.gpu_memory()
                device.memory_free += as_free_memory  # type: ignore[attr-defined]
                device.memory_used -= as_free_memory  # type: ignore[attr-defined]

    def filter_func(
        criteria: Callable[[Snapshot], bool],
        original_criteria: Callable[[Snapshot], bool],
    ) -> Callable[[Snapshot], bool]:
        # Wrap the (tolerance-loosened) criteria so that a device failing the
        # strict criteria has its `loosen_constraints` counter bumped as a
        # side effect while being filtered.
        def wrapped(device: Snapshot) -> bool:
            device.loosen_constraints += int(not original_criteria(device))  # type: ignore[attr-defined]
            return criteria(device)

        return wrapped

    # Each constraint below chains a lazy `filter` over the previous one; the
    # filters are only evaluated when `list(...)` materializes the result.
    if min_free_memory is not None:
        loosen_min_free_memory = min_free_memory * (1.0 - tolerance)
        available_devices = filter(  # type: ignore[assignment]
            filter_func(
                lambda device: device.memory_free >= loosen_min_free_memory,
                lambda device: device.memory_free >= min_free_memory,
            ),
            available_devices,
        )
    if min_total_memory is not None:
        loosen_min_total_memory = min_total_memory * (1.0 - tolerance)
        available_devices = filter(  # type: ignore[assignment]
            filter_func(
                lambda device: device.memory_total >= loosen_min_total_memory,
                lambda device: device.memory_total >= min_total_memory,
            ),
            available_devices,
        )
    if max_gpu_utilization is not None:
        loosen_max_gpu_utilization = max_gpu_utilization + 100.0 * tolerance
        available_devices = filter(  # type: ignore[assignment]
            filter_func(
                lambda device: device.gpu_utilization <= loosen_max_gpu_utilization,
                lambda device: device.gpu_utilization <= max_gpu_utilization,
            ),
            available_devices,
        )
    if max_memory_utilization is not None:
        loosen_max_memory_utilization = max_memory_utilization + 100.0 * tolerance
        available_devices = filter(  # type: ignore[assignment]
            filter_func(
                lambda device: device.memory_utilization <= loosen_max_memory_utilization,
                lambda device: device.memory_utilization <= max_memory_utilization,
            ),
            available_devices,
        )

    available_devices = list(available_devices)
    if sort:
        # Devices meeting all strict constraints come first; NaN metrics sort
        # before valid ones via the `(not isnan, value)` tuples.
        available_devices.sort(
            key=lambda device: (
                device.loosen_constraints,
                (not math.isnan(device.memory_free), -device.memory_free),  # descending
                (not math.isnan(device.memory_used), -device.memory_used),  # descending
                (not math.isnan(device.gpu_utilization), device.gpu_utilization),  # ascending
                (not math.isnan(device.memory_utilization), device.memory_utilization),  # ascending
                -device.physical_index,  # descending to keep GPU 0 free
            ),
        )

    if any(device.is_mig_device for device in available_devices):  # found MIG devices!
        non_mig_devices = [device for device in available_devices if not device.is_mig_device]
        mig_devices = [device for device in available_devices if device.is_mig_device]
        if len(non_mig_devices) >= min_count > 0 or not available_devices[0].is_mig_device:
            available_devices = non_mig_devices
        else:
            available_devices = mig_devices[:1]  # at most one MIG device is visible
            if format == 'index' and not force_index:
                # MIG devices cannot be addressed by a plain index.
                format = 'uuid'

    available_devices = available_devices[:max_count]
    if format == 'device':
        return [device.real for device in available_devices]
    if format == 'uuid':
        return [device.uuid for device in available_devices]
    return [device.index for device in available_devices]


# pylint: disable-next=too-many-branches,too-many-statements
def parse_arguments() -> argparse.Namespace:
    """Parse command-line arguments for ``nvisel``."""

    def non_negint(argstring: str) -> int:
        # argparse `type` callable: accept only integers >= 0.
        num = int(argstring)
        if num < 0:
            raise ValueError
        return num

    # Shown by argparse in error messages for invalid values.
    non_negint.__name__ = 'non-negative integer'

    parser = argparse.ArgumentParser(
        prog='nvisel',
        description='CUDA visible devices selection tool.',
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False,
    )
    parser.add_argument(
        '--help',
        '-h',
        dest='help',
        action='help',
        default=argparse.SUPPRESS,
        help='Show this help message and exit.',
    )
    parser.add_argument(
        '--version',
        '-V',
        dest='version',
        action='version',
        version=f'%(prog)s {__version__}',
        help="Show %(prog)s's version number and exit.",
    )

    constraints = parser.add_argument_group('constraints')
    constraints.add_argument(
        '--inherit',
        '-i',
        dest='inherit',
        type=str,
        default=argparse.SUPPRESS,
        nargs='?',
        metavar='CUDA_VISIBLE_DEVICES',
        help=(
            'Inherit the given `CUDA_VISIBLE_DEVICES`. If the argument is omitted, use the\n'
            'value from the environment. This means selecting a subset of the currently\n'
            'CUDA-visible devices.'
), ) constraints.add_argument( '--account-as-free', dest='free_accounts', nargs='*', metavar='USERNAME', help=( 'Account the used GPU memory of the given users as free memory.\n' 'If this option is specified but without argument, `$USER` will be used.' ), ) constraints.add_argument( '--min-count', '-c', dest='min_count', type=non_negint, default=0, metavar='N', help=( 'Minimum number of devices to select. (default: %(default)d)\n' 'The tool will fail (exit non-zero) if the requested resource is not available.' ), ) constraints.add_argument( '--max-count', '-C', dest='max_count', type=non_negint, default=None, metavar='N', help='Maximum number of devices to select. (default: all devices)', ) constraints.add_argument( '--count', '-n', dest='count', type=non_negint, metavar='N', help='Overriding both `--min-count N` and `--max-count N`.', ) constraints.add_argument( '--min-free-memory', '-f', dest='min_free_memory', type=human2bytes, default=None, metavar='SIZE', help=( 'Minimum free memory of devices to select. (example value: 4GiB)\n' 'If this constraint is given, check against all devices.' ), ) constraints.add_argument( '--min-total-memory', '-t', dest='min_total_memory', type=human2bytes, default=None, metavar='SIZE', help=( 'Minimum total memory of devices to select. (example value: 10GiB)\n' 'If this constraint is given, check against all devices.' ), ) constraints.add_argument( '--max-gpu-utilization', '-G', dest='max_gpu_utilization', type=non_negint, default=None, metavar='RATE', help=( 'Maximum GPU utilization rate of devices to select. (example value: 30)\n' 'If this constraint is given, check against all devices.' ), ) constraints.add_argument( '--max-memory-utilization', '-M', dest='max_memory_utilization', type=non_negint, default=None, metavar='RATE', help=( 'Maximum memory bandwidth utilization rate of devices to select. (example value: 50)\n' 'If this constraint is given, check against all devices.' 
), ) constraints.add_argument( '--tolerance', '--tol', dest='tolerance', type=non_negint, default=10, metavar='TOL', help=( 'The constraints tolerance (in percentage). (default: 0, i.e., strict)\n' 'This option can loose the constraints if the requested resource is not available.\n' 'For example, set `--tolerance=20` will accept a device with only 4GiB of free\n' 'memory when set `--min-free-memory=5GiB`.' ), ) formatter = parser.add_argument_group('formatting') formatter.add_argument( '--format', '-O', dest='format', type=str, choices=('index', 'uuid'), default='index', metavar='FORMAT', help=( 'The output format of the selected device identifiers. (default: %(default)s)\n' 'If any MIG device found, the output format will be fallback to `uuid`.' ), ) separator = formatter.add_mutually_exclusive_group() separator.add_argument( '--sep', '--separator', '-s', dest='sep', type=str, default=',', metavar='SEP', help='Separator for the output. (default: %(default)r)', ) separator.add_argument( '--newline', dest='newline', action='store_true', help=r"Use newline character as separator for the output, equivalent to `--sep=$'\n'`.", ) separator.add_argument( '--null', '-0', dest='null', action='store_true', help=( "Use null character ('\\x00') as separator for the output. This option corresponds\n" 'to the `-0` option of `xargs`.' 
), ) formatter.add_argument( '--no-sort', '-S', dest='sort', action='store_false', help='Do not sort the device by memory usage and GPU utilization.', ) args = parser.parse_args() if args.count is not None: args.min_count = args.max_count = args.count if args.max_count is not None and args.max_count < args.min_count: raise RuntimeError('Max count must be no less than min count.') if args.newline: args.sep = '\n' elif args.null: args.sep = '\0' if args.free_accounts is not None and len(args.free_accounts) == 0: with contextlib.suppress(ImportError, OSError): args.free_accounts.append(host.getuser()) return args def main() -> int: """Main function for ``nvisel`` CLI.""" args = parse_arguments() devices: Sequence[Device] try: if hasattr(args, 'inherit'): if args.inherit is not None: os.environ['CUDA_VISIBLE_DEVICES'] = args.inherit devices = Device.from_cuda_visible_devices() else: devices = Device.all() except libnvml.NVMLError_LibraryNotFound: return 1 except libnvml.NVMLError as ex: print( '{} {}'.format(colored('NVML ERROR:', color='red', attrs=('bold',)), ex), file=sys.stderr, ) return 2 except RuntimeError as ex: print( '{} {}'.format( colored('CUDA ERROR:', color='red', attrs=('bold',)), str(ex).replace('CUDA Error: ', ''), ), file=sys.stderr, ) return 3 identifiers = select_devices( # type: ignore[call-overload] devices, format=args.format, min_count=args.min_count, max_count=args.max_count, min_free_memory=args.min_free_memory, min_total_memory=args.min_total_memory, max_gpu_utilization=args.max_gpu_utilization, max_memory_utilization=args.max_memory_utilization, tolerance=args.tolerance, free_accounts=args.free_accounts, sort=args.sort, ) identifiers = list(map(str, identifiers)) result = args.sep.join(identifiers) if not sys.stdout.isatty(): print('CUDA_VISIBLE_DEVICES="{}"'.format(','.join(identifiers)), file=sys.stderr) retval = 0 if len(identifiers) < args.min_count: warnings.warn('Not enough devices found.', RuntimeWarning, stacklevel=1) retval = 4 if 
args.sep == '\0': print(result, end='\0') else: print(result) return retval if __name__ == '__main__': sys.exit(main()) nvitop-1.4.2/nvitop/tui/000077500000000000000000000000001474547113600151715ustar00rootroot00000000000000nvitop-1.4.2/nvitop/tui/COPYING000066400000000000000000001045151474547113600162320ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. 
For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. 
Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. 
"Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. 
Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 
When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. 
This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . nvitop-1.4.2/nvitop/tui/__init__.py000066400000000000000000000004771474547113600173120ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. # pylint: disable=missing-module-docstring from nvitop.tui.library import ( SUPERUSER, USERNAME, Device, colored, libcurses, set_color, setlocale_utf8, ) from nvitop.tui.tui import TUI nvitop-1.4.2/nvitop/tui/library/000077500000000000000000000000001474547113600166355ustar00rootroot00000000000000nvitop-1.4.2/nvitop/tui/library/__init__.py000066400000000000000000000022101474547113600207410ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. 
# pylint: disable=missing-module-docstring from nvitop.tui.library.device import NA, Device from nvitop.tui.library.displayable import Displayable, DisplayableContainer from nvitop.tui.library.history import BufferedHistoryGraph, HistoryGraph from nvitop.tui.library.keybinding import ( ALT_KEY, ANYKEY, PASSIVE_ACTION, QUANT_KEY, SPECIAL_KEYS, KeyBuffer, KeyMaps, normalize_keybinding, ) from nvitop.tui.library.libcurses import libcurses, setlocale_utf8 from nvitop.tui.library.messagebox import MessageBox, send_signal from nvitop.tui.library.mouse import MouseEvent from nvitop.tui.library.process import ( GiB, GpuProcess, HostProcess, Snapshot, bytes2human, host, timedelta2human, ) from nvitop.tui.library.selection import Selection from nvitop.tui.library.utils import ( HOSTNAME, LARGE_INTEGER, SUPERUSER, USERCONTEXT, USERNAME, colored, cut_string, make_bar, set_color, ttl_cache, ) from nvitop.tui.library.widestring import WideString, wcslen nvitop-1.4.2/nvitop/tui/library/device.py000066400000000000000000000164171474547113600204570ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. 
# pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring from nvitop.api import NA, libnvml, ttl_cache, utilization2string from nvitop.api import MigDevice as MigDeviceBase from nvitop.api import PhysicalDevice as DeviceBase from nvitop.tui.library.process import GpuProcess __all__ = ['Device', 'NA'] class Device(DeviceBase): GPU_PROCESS_CLASS = GpuProcess MEMORY_UTILIZATION_THRESHOLDS = (10, 80) GPU_UTILIZATION_THRESHOLDS = (10, 75) INTENSITY2COLOR = {'light': 'green', 'moderate': 'yellow', 'heavy': 'red'} SNAPSHOT_KEYS = [ 'name', 'bus_id', 'memory_used', 'memory_free', 'memory_total', 'memory_used_human', 'memory_free_human', 'memory_total_human', 'memory_percent', 'memory_usage', 'gpu_utilization', 'memory_utilization', 'fan_speed', 'temperature', 'power_usage', 'power_limit', 'power_status', 'display_active', 'current_driver_model', 'persistence_mode', 'performance_state', 'total_volatile_uncorrected_ecc_errors', 'compute_mode', 'mig_mode', 'is_mig_device', 'memory_percent_string', 'memory_utilization_string', 'gpu_utilization_string', 'fan_speed_string', 'temperature_string', 'memory_loading_intensity', 'memory_display_color', 'gpu_loading_intensity', 'gpu_display_color', 'loading_intensity', 'display_color', ] def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._snapshot = None self.tuple_index = (self.index,) if isinstance(self.index, int) else self.index self.display_index = ':'.join(map(str, self.tuple_index)) def as_snapshot(self): self._snapshot = super().as_snapshot() self._snapshot.tuple_index = self.tuple_index self._snapshot.display_index = self.display_index return self._snapshot @property def snapshot(self): if self._snapshot is None: self.as_snapshot() return self._snapshot def mig_devices(self): mig_devices = [] if self.is_mig_mode_enabled(): for mig_index in range(self.max_mig_device_count()): try: mig_device = MigDevice(index=(self.index, mig_index)) except libnvml.NVMLError: # noqa: 
PERF203 break else: mig_devices.append(mig_device) return mig_devices fan_speed = ttl_cache(ttl=5.0)(DeviceBase.fan_speed) temperature = ttl_cache(ttl=5.0)(DeviceBase.temperature) power_usage = ttl_cache(ttl=5.0)(DeviceBase.power_usage) display_active = ttl_cache(ttl=5.0)(DeviceBase.display_active) display_mode = ttl_cache(ttl=5.0)(DeviceBase.display_mode) current_driver_model = ttl_cache(ttl=5.0)(DeviceBase.current_driver_model) persistence_mode = ttl_cache(ttl=5.0)(DeviceBase.persistence_mode) performance_state = ttl_cache(ttl=5.0)(DeviceBase.performance_state) total_volatile_uncorrected_ecc_errors = ttl_cache(ttl=5.0)( DeviceBase.total_volatile_uncorrected_ecc_errors, ) compute_mode = ttl_cache(ttl=5.0)(DeviceBase.compute_mode) mig_mode = ttl_cache(ttl=5.0)(DeviceBase.mig_mode) def memory_percent_string(self): # in percentage return utilization2string(self.memory_percent()) def memory_utilization_string(self): # in percentage return utilization2string(self.memory_utilization()) def gpu_utilization_string(self): # in percentage return utilization2string(self.gpu_utilization()) def fan_speed_string(self): # in percentage return utilization2string(self.fan_speed()) def temperature_string(self): # in Celsius temperature = self.temperature() if libnvml.nvmlCheckReturn(temperature, int): temperature = str(temperature) + 'C' return temperature def memory_loading_intensity(self): return self.loading_intensity_of(self.memory_percent(), type='memory') def gpu_loading_intensity(self): return self.loading_intensity_of(self.gpu_utilization(), type='gpu') def loading_intensity(self): loading_intensity = (self.memory_loading_intensity(), self.gpu_loading_intensity()) if 'heavy' in loading_intensity: return 'heavy' if 'moderate' in loading_intensity: return 'moderate' return 'light' def display_color(self): if self.name().startswith('ERROR:'): return 'red' return self.INTENSITY2COLOR.get(self.loading_intensity()) def memory_display_color(self): if 
self.name().startswith('ERROR:'): return 'red' return self.INTENSITY2COLOR.get(self.memory_loading_intensity()) def gpu_display_color(self): if self.name().startswith('ERROR:'): return 'red' return self.INTENSITY2COLOR.get(self.gpu_loading_intensity()) @staticmethod def loading_intensity_of(utilization, type='memory'): # pylint: disable=redefined-builtin thresholds = { 'memory': Device.MEMORY_UTILIZATION_THRESHOLDS, 'gpu': Device.GPU_UTILIZATION_THRESHOLDS, }.get(type) if utilization is NA: return 'moderate' if isinstance(utilization, str): utilization = utilization.replace('%', '') utilization = float(utilization) if utilization >= thresholds[-1]: return 'heavy' if utilization >= thresholds[0]: return 'moderate' return 'light' @staticmethod def color_of(utilization, type='memory'): # pylint: disable=redefined-builtin return Device.INTENSITY2COLOR.get(Device.loading_intensity_of(utilization, type=type)) class MigDevice(MigDeviceBase, Device): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._snapshot = None self.tuple_index = (self.index,) if isinstance(self.index, int) else self.index self.display_index = ':'.join(map(str, self.tuple_index)) def memory_usage(self) -> str: # string of used memory over total memory (in human readable) return f'{self.memory_used_human()} / {self.memory_total_human():>8s}' loading_intensity = Device.memory_loading_intensity SNAPSHOT_KEYS = [ 'name', 'memory_used', 'memory_free', 'memory_total', 'memory_used_human', 'memory_free_human', 'memory_total_human', 'memory_percent', 'memory_usage', 'bar1_memory_used_human', 'bar1_memory_percent', 'gpu_utilization', 'memory_utilization', 'total_volatile_uncorrected_ecc_errors', 'mig_mode', 'is_mig_device', 'gpu_instance_id', 'compute_instance_id', 'memory_percent_string', 'memory_utilization_string', 'gpu_utilization_string', 'memory_loading_intensity', 'memory_display_color', 'gpu_loading_intensity', 'gpu_display_color', 'loading_intensity', 'display_color', ] 
nvitop-1.4.2/nvitop/tui/library/displayable.py000066400000000000000000000176721474547113600215150ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # This file is originally part of ranger, the console file manager. https://github.com/ranger/ranger # License: GNU GPL version 3. # pylint: disable=missing-module-docstring,missing-function-docstring from nvitop.tui.library.libcurses import CursesShortcuts class Displayable(CursesShortcuts): # pylint: disable=too-many-instance-attributes """Displayables are objects which are displayed on the screen. This is just the abstract class, defining basic operations such as resizing, printing, changing colors. Subclasses of displayable can extend these methods: draw() -- draw the object. Is only called if visible. poke() -- is called just before draw(), even if not visible. finalize() -- called after all objects finished drawing. press(key) -- called after a key press on focused objects. destroy() -- called before destroying the displayable object Additionally, there are these methods: __contains__(item) -- is the item (y, x) inside the panel? These attributes are set: Modifiable: focused -- Focused objects receive press() calls. visible -- Visible objects receive draw() and finalize() calls need_redraw -- Should the panel be redrawn? This variable may be set at various places in the script and should eventually be handled (and unset) in the draw() method. Read-Only: (i.e. recommended not to change manually) win -- the own curses window object parent -- the parent (DisplayableContainer) object or None x, y, width, height -- absolute coordinates and boundaries """ def __init__(self, win, root=None): super().__init__() self._need_redraw = True self.focused = False self._old_visible = self._visible = True self.x = 0 self.y = 0 self._width = 0 self.height = 0 self.win = win self.root = root self.parent = None def __contains__(self, item): """Check if item is inside the boundaries. 
item can be an iterable like [y, x] or an object with x and y methods. """ try: y, x = item.y, item.x except AttributeError: try: y, x = item except (ValueError, TypeError): return False return self.contains_point(y, x) def contains_point(self, y, x): """Test whether the point lies inside this object. x and y should be absolute coordinates. """ return (self.x <= x < self.x + self.width) and (self.y <= y < self.y + self.height) def poke(self): """Called before drawing, even if invisible.""" if self._old_visible != self.visible: self._old_visible = self.visible self.need_redraw = True if not self.visible: self.win.erase() def draw(self): """Draw the object. Called on every main iteration if visible. Containers should call draw() on their contained objects here. Override this! """ self.need_redraw = False def finalize(self): """Called after every displayable is done drawing. Override this! """ self.need_redraw = False def destroy(self): """Called when the object is destroyed.""" self.win = None self.root = None def click(self, event): """Called when a mouse key is pressed and self.focused is True. Override this! """ def press(self, key): """Called when a key is pressed and self.focused is True. Override this! 
""" @property def visible(self): return self._visible @visible.setter def visible(self, value): if self._visible != value: self.need_redraw = True self._visible = value if not self.visible: self.focused = False @property def need_redraw(self): return self._need_redraw @need_redraw.setter def need_redraw(self, value): if self._need_redraw != value: self._need_redraw = value if value and self.parent is not None and not self.parent.need_redraw: self.parent.need_redraw = True @property def width(self): return self._width @width.setter def width(self, value): if self.width != value and self.visible: self.need_redraw = True self._width = value def __str__(self): return self.__class__.__name__ class DisplayableContainer(Displayable): """DisplayableContainers are Displayables which contain other Displayables. This is also an abstract class. The methods draw, poke, finalize, click, press and destroy are extended here and will recursively call the function on all contained objects. New methods: add_child(object) -- add the object to the container. replace_child(old_obj, new_obj) -- replaces old object with new object. remove_child(object) -- remove the object from the container. 
New attributes: container -- a list with all contained objects (rw) """ def __init__(self, win, root=None): super().__init__(win, root) self.container = [] # extended or overridden methods def poke(self): """Recursively called on objects in container.""" super().poke() for displayable in self.container: displayable.poke() def draw(self): """Recursively called on visible objects in container.""" for displayable in self.container: if self.need_redraw: displayable.need_redraw = True if displayable.visible: displayable.draw() self.need_redraw = False def finalize(self): """Recursively called on visible objects in container.""" for displayable in self.container: if displayable.visible: displayable.finalize() def destroy(self): """Recursively called on objects in container.""" for displayable in self.container: displayable.destroy() super().destroy() def press(self, key): """Recursively called on objects in container.""" focused_obj = self.get_focused_obj() if focused_obj: focused_obj.press(key) return True return False def click(self, event): """Recursively called on objects in container.""" focused_obj = self.get_focused_obj() if focused_obj and focused_obj.click(event): return True return any( displayable.visible and event in displayable and displayable.click(event) for displayable in self.container ) # new methods def add_child(self, obj): """Add the objects to the container.""" if obj.parent is not None: obj.parent.remove_child(obj) self.container.append(obj) obj.parent = self obj.root = self.root def replace_child(self, old_obj, new_obj): """Replace the old object with the new instance in the container.""" self.container[self.container.index(old_obj)] = new_obj new_obj.parent = self new_obj.root = self.root def remove_child(self, obj): """Remove the object from the container.""" try: self.container.remove(obj) except ValueError: pass else: obj.parent = None obj.root = None def get_focused_obj(self): # Finds a focused displayable object in the container. 
for displayable in self.container: if displayable.focused: return displayable try: obj = displayable.get_focused_obj() except AttributeError: pass else: if obj is not None: return obj return None nvitop-1.4.2/nvitop/tui/library/history.py000066400000000000000000000302441474547113600207130ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. # pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring import functools import itertools import threading import time from collections import deque from nvitop.api import NA BOUND_UPDATE_INTERVAL = 1.0 # fmt: off VALUE2SYMBOL_UP = { (0, 0): ' ', (0, 1): '⢀', (0, 2): '⢠', (0, 3): '⢰', (0, 4): '⢸', (1, 0): '⡀', (1, 1): '⣀', (1, 2): '⣠', (1, 3): '⣰', (1, 4): '⣸', (2, 0): '⡄', (2, 1): '⣄', (2, 2): '⣤', (2, 3): '⣴', (2, 4): '⣼', (3, 0): '⡆', (3, 1): '⣆', (3, 2): '⣦', (3, 3): '⣶', (3, 4): '⣾', (4, 0): '⡇', (4, 1): '⣇', (4, 2): '⣧', (4, 3): '⣷', (4, 4): '⣿', } VALUE2SYMBOL_DOWN = { (0, 0): ' ', (0, 1): '⠈', (0, 2): '⠘', (0, 3): '⠸', (0, 4): '⢸', (1, 0): '⠁', (1, 1): '⠉', (1, 2): '⠙', (1, 3): '⠹', (1, 4): '⢹', (2, 0): '⠃', (2, 1): '⠋', (2, 2): '⠛', (2, 3): '⠻', (2, 4): '⢻', (3, 0): '⠇', (3, 1): '⠏', (3, 2): '⠟', (3, 3): '⠿', (3, 4): '⢿', (4, 0): '⡇', (4, 1): '⡏', (4, 2): '⡟', (4, 3): '⡿', (4, 4): '⣿', } # fmt: on SYMBOL2VALUE_UP = {v: k for k, v in VALUE2SYMBOL_UP.items()} SYMBOL2VALUE_DOWN = {v: k for k, v in VALUE2SYMBOL_DOWN.items()} PAIR2SYMBOL_UP = { (s1, s2): VALUE2SYMBOL_UP[SYMBOL2VALUE_UP[s1][-1], SYMBOL2VALUE_UP[s2][0]] for s1, s2 in itertools.product(SYMBOL2VALUE_UP, repeat=2) } PAIR2SYMBOL_DOWN = { (s1, s2): VALUE2SYMBOL_DOWN[SYMBOL2VALUE_DOWN[s1][-1], SYMBOL2VALUE_DOWN[s2][0]] for s1, s2 in itertools.product(SYMBOL2VALUE_DOWN, repeat=2) } GRAPH_SYMBOLS = ''.join( sorted(set(itertools.chain(VALUE2SYMBOL_UP.values(), VALUE2SYMBOL_DOWN.values()))), ).replace(' ', '') def grouped(iterable, size, fillvalue=None): 
yield from itertools.zip_longest(*([iter(iterable)] * size), fillvalue=fillvalue) class HistoryGraph: # pylint: disable=too-many-instance-attributes MAX_WIDTH = 1024 # pylint: disable-next=too-many-arguments def __init__( self, upperbound, width, height, *, format='{:.1f}'.format, # pylint: disable=redefined-builtin max_format=None, baseline=0.0, dynamic_bound=False, min_bound=None, init_bound=None, upsidedown=False, ): assert baseline < upperbound self.format = format if max_format is None: max_format = format self.max_format = max_format if dynamic_bound: if min_bound is None: min_bound = baseline + 0.1 * (upperbound - baseline) if init_bound is None: init_bound = upperbound else: assert min_bound is None assert init_bound is None min_bound = init_bound = upperbound self.baseline = baseline self.min_bound = min_bound self.max_bound = upperbound self.bound = init_bound self.next_bound_update_at = time.monotonic() self._width = width self._height = height self.maxlen = 2 * self.width + 1 self.history = deque( [self.baseline - 0.1] * (2 * self.MAX_WIDTH + 1), maxlen=(2 * self.MAX_WIDTH + 1), ) self.reversed_history = deque([self.baseline - 0.1] * self.maxlen, maxlen=self.maxlen) self._max_value_maintainer = deque([self.baseline - 0.1] * self.maxlen, maxlen=self.maxlen) self.last_retval = None self.graph = [] self.last_graph = [] self.upsidedown = upsidedown if upsidedown: self.value2symbol = VALUE2SYMBOL_DOWN self.pair2symbol = PAIR2SYMBOL_DOWN else: self.value2symbol = VALUE2SYMBOL_UP self.pair2symbol = PAIR2SYMBOL_UP self.write_lock = threading.Lock() self.remake_lock = threading.Lock() self.remake_graph() @property def width(self): return self._width @width.setter def width(self, value): if self._width != value: assert isinstance(value, int) assert value >= 1 self._width = value with self.write_lock: self.maxlen = 2 * self.width + 1 self.reversed_history = deque( (self.baseline - 0.1,) * self.maxlen, maxlen=self.maxlen, ) self._max_value_maintainer = deque( 
(self.baseline - 0.1,) * self.maxlen, maxlen=self.maxlen, ) for history in itertools.islice( self.history, max(0, self.history.maxlen - self.maxlen), self.history.maxlen, ): if self.reversed_history[-1] == self._max_value_maintainer[0]: self._max_value_maintainer.popleft() while ( len(self._max_value_maintainer) > 0 and self._max_value_maintainer[-1] < history ): self._max_value_maintainer.pop() self.reversed_history.appendleft(history) self._max_value_maintainer.append(history) self.remake_graph() @property def height(self): return self._height @height.setter def height(self, value): if self._height != value: assert isinstance(value, int) assert value >= 1 self._height = value self.remake_graph() @property def graph_size(self): return (self.width, self.height) @graph_size.setter def graph_size(self, value): width, height = value assert isinstance(width, int) assert width >= 1 assert isinstance(height, int) assert height >= 1 self._height = height self._width = width - 1 # trigger force remake self.width = width @property def last_value(self): return self.reversed_history[0] @property def max_value(self): return self._max_value_maintainer[0] def last_value_string(self): last_value = self.last_value if last_value >= self.baseline: return self.format(last_value) try: return self.format(NA) except ValueError: return NA __str__ = last_value_string def max_value_string(self): max_value = self.max_value if max_value >= self.baseline: return self.max_format(max_value) try: return self.max_format(NA) except ValueError: return NA def add(self, value): if value is NA: value = self.baseline - 0.1 if not isinstance(value, (int, float)): return with self.write_lock: if self.reversed_history[-1] == self._max_value_maintainer[0]: self._max_value_maintainer.popleft() while len(self._max_value_maintainer) > 0 and self._max_value_maintainer[-1] < value: self._max_value_maintainer.pop() self.reversed_history.appendleft(value) self._max_value_maintainer.append(value) 
self.history.append(value) new_bound = self.baseline + 1.25 * (self.max_value - self.baseline) new_bound = min(max(new_bound, self.min_bound), self.max_bound) timestamp = time.monotonic() if new_bound != self.bound and self.next_bound_update_at <= timestamp: self.bound = new_bound self.remake_graph() self.next_bound_update_at = timestamp + BOUND_UPDATE_INTERVAL return self.graph, self.last_graph = self.last_graph, self.graph bar = self.make_bar(self.reversed_history[1], value) # pylint: disable=disallowed-name for i, (line, char) in enumerate(zip(self.graph, bar)): self.graph[i] = (line + char)[-self.width :] def remake_graph(self): with self.remake_lock: if self.max_value >= self.baseline: reversed_bars = [] for _, (value2, value1) in zip( range(self.width), grouped(self.reversed_history, size=2, fillvalue=self.baseline), ): reversed_bars.append(self.make_bar(value1, value2)) graph = list(map(''.join, zip(*reversed(reversed_bars)))) for i, line in enumerate(graph): graph[i] = line.rjust(self.width)[-self.width :] self.graph = graph self.last_graph = list(map(self.shift_line, self.graph)) else: self.graph = [' ' * self.width for _ in range(self.height)] self.last_graph = [' ' * (self.width - 1) for _ in range(self.height)] def make_bar(self, value1, value2): if self.bound <= self.baseline: return [' '] * self.height value1 = self.height * min((value1 - self.baseline) / (self.bound - self.baseline), 1.0) value2 = self.height * min((value2 - self.baseline) / (self.bound - self.baseline), 1.0) if value1 >= 0.0: value1 = max(value1, 0.2) if value2 >= 0.0: value2 = max(value2, 0.2) # pylint: disable=disallowed-name,invalid-name bar = [] for h in range(self.height): s1 = min(max(round(5 * (value1 - h)), 0), 4) s2 = min(max(round(5 * (value2 - h)), 0), 4) bar.append(self.value2symbol[s1, s2]) if not self.upsidedown: bar.reverse() return bar def shift_line(self, line): return ''.join(map(self.pair2symbol.get, zip(line, line[1:]))) def __getitem__(self, item): return 
self.reversed_history[item] def hook(self, func, get_value=None): @functools.wraps(func) def wrapped(*args, **kwargs): self.last_retval = retval = value = func(*args, **kwargs) if get_value is not None: value = get_value(retval) self.add(value) return retval wrapped.history = self return wrapped __call__ = hook class BufferedHistoryGraph(HistoryGraph): # pylint: disable-next=too-many-arguments def __init__( self, upperbound, width, height, *, format='{:.1f}'.format, # pylint: disable=redefined-builtin max_format=None, baseline=0.0, dynamic_bound=False, upsidedown=False, min_bound=None, init_bound=None, interval=1.0, ): assert interval > 0.0 super().__init__( upperbound, width, height, format=format, max_format=max_format, baseline=baseline, dynamic_bound=dynamic_bound, min_bound=min_bound, init_bound=init_bound, upsidedown=upsidedown, ) self.interval = interval self.start_time = time.monotonic() self.last_update_time = self.start_time self.buffer = [] @property def last_value(self): last_value = super().last_value if last_value < self.baseline and len(self.buffer) > 0: return sum(self.buffer) / len(self.buffer) return last_value def add(self, value): if value is NA: value = self.baseline - 0.1 if not isinstance(value, (int, float)): return timestamp = time.monotonic() timedelta = timestamp - self.last_update_time if len(self.buffer) > 0 and timedelta >= self.interval: new_value = sum(self.buffer) / len(self.buffer) self.buffer.clear() last_value = self.reversed_history[0] if last_value >= self.baseline: n_interval = int(timedelta / self.interval) for i in range(1, n_interval): super().add(last_value + (i / n_interval) * (new_value - last_value)) super().add(new_value) self.last_update_time += (timedelta // self.interval) * self.interval self.buffer.append(value) nvitop-1.4.2/nvitop/tui/library/keybinding.py000066400000000000000000000265451474547113600213460ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. 
# This file is originally part of ranger, the console file manager. https://github.com/ranger/ranger
# License: GNU GPL version 3.

# pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring

import copy
import curses
import curses.ascii
import string
from collections import OrderedDict


DIGITS = set(map(ord, string.digits))

# Arbitrary numbers which are not used with curses.KEY_XYZ
ANYKEY, PASSIVE_ACTION, ALT_KEY, QUANT_KEY = range(9001, 9005)

SPECIAL_KEYS = OrderedDict(
    [
        ('BS', curses.KEY_BACKSPACE),
        ('Backspace', curses.KEY_BACKSPACE),  # overrides <BS> in REVERSED_SPECIAL_KEYS
        ('Backspace2', curses.ascii.DEL),
        ('Delete', curses.KEY_DC),
        ('S-Delete', curses.KEY_SDC),
        ('Insert', curses.KEY_IC),
        ('CR', ord('\n')),
        ('Return', ord('\n')),
        ('Enter', ord('\n')),  # overrides <CR> and <Return> in REVERSED_SPECIAL_KEYS
        ('Space', ord(' ')),
        ('Escape', curses.ascii.ESC),
        ('Esc', curses.ascii.ESC),  # overrides <Escape> in REVERSED_SPECIAL_KEYS
        ('Down', curses.KEY_DOWN),
        ('Up', curses.KEY_UP),
        ('Left', curses.KEY_LEFT),
        ('Right', curses.KEY_RIGHT),
        ('PageDown', curses.KEY_NPAGE),
        ('PageUp', curses.KEY_PPAGE),
        ('Home', curses.KEY_HOME),
        ('End', curses.KEY_END),
        ('Tab', ord('\t')),
        ('S-Tab', curses.KEY_BTAB),
        ('lt', ord('<')),
        ('gt', ord('>')),
    ],
)
NAMED_SPECIAL_KEYS = tuple(SPECIAL_KEYS.keys())
SPECIAL_KEYS_UNCASED = {}
VERY_SPECIAL_KEYS = {
    'Alt': ALT_KEY,
    'any': ANYKEY,
    'bg': PASSIVE_ACTION,
    'allow_quantifiers': QUANT_KEY,
}


def _uncase_special_key(key_string):
    """Uncase a special key.

    >>> _uncase_special_key('Esc')
    'esc'

    >>> _uncase_special_key('C-X')
    'c-x'
    >>> _uncase_special_key('C-x')
    'c-x'

    >>> _uncase_special_key('A-X')
    'a-X'
    >>> _uncase_special_key('A-x')
    'a-x'
    """
    uncased = key_string.lower()
    # Alt-combinations keep the case of the trailing character (A-X != A-x).
    if len(uncased) == 3 and (uncased.startswith(('a-', 'm-'))):
        uncased = f'{uncased[0]}-{key_string[-1]}'
    return uncased


def _special_keys_init():
    # Expand SPECIAL_KEYS with Alt/Meta and Ctrl combinations and function
    # keys, then build the case-insensitive lookup table.
    for key, val in tuple(SPECIAL_KEYS.items()):
        SPECIAL_KEYS['M-' + key] = (ALT_KEY, val)
        SPECIAL_KEYS['A-' + key] = (ALT_KEY, val)  # overrides <M-...> in REVERSED_SPECIAL_KEYS

    for char in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_!{}[],./':
        SPECIAL_KEYS['M-' + char] = (ALT_KEY, ord(char))
        SPECIAL_KEYS['A-' + char] = (ALT_KEY, ord(char))  # overrides <M-...> in REVERSED_SPECIAL_KEYS

    # We will need to reorder the keys of SPECIAL_KEYS below.
    # For example, <C-j> would override <Enter> in REVERSE_SPECIAL_KEYS,
    # this makes construct_keybinding(parse_keybinding('<Enter>')) == '<Enter>'
    for char in 'abcdefghijklmnopqrstuvwxyz_':
        SPECIAL_KEYS['C-' + char] = ord(char) - 96

    SPECIAL_KEYS['C-Space'] = 0

    for n in range(64):
        SPECIAL_KEYS['F' + str(n)] = curses.KEY_F0 + n

    SPECIAL_KEYS.update(VERY_SPECIAL_KEYS)  # noqa: F821

    # Reorder the keys of SPECIAL_KEYS so named keys win in the reversed map.
    for key in NAMED_SPECIAL_KEYS:  # noqa: F821
        SPECIAL_KEYS.move_to_end(key, last=True)

    for key, val in SPECIAL_KEYS.items():
        SPECIAL_KEYS_UNCASED[_uncase_special_key(key)] = val


_special_keys_init()
del _special_keys_init, VERY_SPECIAL_KEYS, NAMED_SPECIAL_KEYS

REVERSED_SPECIAL_KEYS = OrderedDict([(v, k) for k, v in SPECIAL_KEYS.items()])


def parse_keybinding(obj):  # pylint: disable=too-many-branches
    r"""Translate a keybinding to a sequence of integers

    The letter case of special keys in the keybinding string will be ignored.

    >>> out = tuple(parse_keybinding('lol<CR>'))
    >>> out
    (108, 111, 108, 10)
    >>> out == (ord('l'), ord('o'), ord('l'), ord('\n'))
    True

    >>> out = tuple(parse_keybinding('x<A-Left>'))
    >>> out
    (120, 9003, 260)
    >>> out == (ord('x'), ALT_KEY, curses.KEY_LEFT)
    True
    """
    assert isinstance(obj, (tuple, int, str))

    if isinstance(obj, tuple):
        yield from obj
    elif isinstance(obj, int):  # pylint: disable=too-many-nested-blocks
        yield obj
    else:  # pylint: disable=too-many-nested-blocks
        in_brackets = False
        bracket_content = []
        for char in obj:
            if in_brackets:
                if char == '>':
                    in_brackets = False
                    key_string = ''.join(bracket_content)
                    try:
                        keys = SPECIAL_KEYS_UNCASED[_uncase_special_key(key_string)]
                        yield from keys
                    except KeyError:
                        # Unknown name: a pure number is a raw key code,
                        # otherwise emit the bracketed text literally.
                        if key_string.isdigit():
                            yield int(key_string)
                        else:
                            yield ord('<')
                            for bracket_char in bracket_content:
                                yield ord(bracket_char)
                            yield ord('>')
                    except TypeError:
                        yield keys  # it was no tuple, just an int
                else:
                    bracket_content.append(char)
            elif char == '<':
                in_brackets = True
                bracket_content = []
            else:
                yield ord(char)
        if in_brackets:
            # Unterminated '<...': emit the consumed characters literally.
            yield ord('<')
            for char in bracket_content:
                yield ord(char)


def key_to_string(key):
    if key in range(33, 127):
        return chr(key)
    if key in REVERSED_SPECIAL_KEYS:
        return f'<{REVERSED_SPECIAL_KEYS[key]}>'
    return f'<{key}>'


def construct_keybinding(keys):
    """Do the reverse of parse_keybinding.

    >>> construct_keybinding(parse_keybinding('lol<CR>'))
    'lol<Enter>'

    >>> construct_keybinding(parse_keybinding('x<A-Left>'))
    'x<A-Left>'

    >>> construct_keybinding(parse_keybinding('x<a-left>'))
    'x<A-Left>'
    """
    try:
        keys = tuple(keys)
    except TypeError:
        assert isinstance(keys, int)
        keys = (keys,)

    strings = []
    alt_key_on = False
    for key in keys:
        if key == ALT_KEY:
            alt_key_on = True
            continue
        if alt_key_on:
            try:
                strings.append(f'<{REVERSED_SPECIAL_KEYS[ALT_KEY, key]}>')
            except KeyError:
                strings.extend(map(key_to_string, (ALT_KEY, key)))
        else:
            strings.append(key_to_string(key))
        alt_key_on = False

    return ''.join(strings)


def normalize_keybinding(keybinding):
    """Normalize a keybinding to a string.

    >>> normalize_keybinding('lol<CR>')
    'lol<Enter>'

    >>> normalize_keybinding('x<A-Left>')
    'x<A-Left>'

    >>> normalize_keybinding('x<a-left>')
    'x<A-Left>'
    """
    return construct_keybinding(parse_keybinding(keybinding))


class KeyMaps(dict):
    """Mapping of context name -> nested dict trie of parsed key codes."""

    def __init__(self, keybuffer=None):
        super().__init__()
        self.keybuffer = keybuffer
        self.used_keymap = None

    def use_keymap(self, keymap_name):
        self.keybuffer.keymap = self.get(keymap_name, {})
        if self.used_keymap != keymap_name:
            self.used_keymap = keymap_name
            self.keybuffer.clear()

    def clear_keymap(self, keymap_name):
        self[keymap_name] = {}
        if self.used_keymap == keymap_name:
            self.keybuffer.keymap = {}
            self.keybuffer.clear()

    def _clean_input(self, context, keys):
        try:
            pointer = self[context]
        except KeyError:
            self[context] = pointer = {}
        # Normalize the encoding before parsing the binding string.
        keys = keys.encode('utf-8').decode('latin-1')
        return list(parse_keybinding(keys)), pointer

    def bind(self, context, keys, leaf):
        keys, pointer = self._clean_input(context, keys)
        if not keys:
            return
        last_key = keys[-1]
        for key in keys[:-1]:
            if key in pointer and isinstance(pointer[key], dict):
                pointer = pointer[key]
            else:
                pointer = pointer[key] = {}
        pointer[last_key] = leaf

    def copy(self, context, source, target):
        clean_source, pointer = self._clean_input(context, source)
        if not source:
            return
        for key in clean_source:
            try:
                pointer = pointer[key]
            except KeyError as ex:  # noqa: PERF203
                raise KeyError(
                    f'Tried to copy the keybinding `{source}`, but it was not found.',
                ) from ex
        try:
            self.bind(context, target, copy.deepcopy(pointer))
        except TypeError:
            # Leaf is not deep-copyable (e.g. a bound method); share it.
            self.bind(context, target, pointer)

    def unbind(self, context, keys):
        keys, pointer = self._clean_input(context, keys)
        if not keys:
            return
        self._unbind_traverse(pointer, keys)

    @staticmethod
    def _unbind_traverse(pointer, keys, pos=0):
        # Recursively remove the binding and prune empty sub-tries.
        if keys[pos] not in pointer:
            return
        if len(keys) > pos + 1 and isinstance(pointer, dict):
            KeyMaps._unbind_traverse(pointer[keys[pos]], keys, pos=pos + 1)
            if not pointer[keys[pos]]:
                del pointer[keys[pos]]
        elif len(keys) == pos + 1:
            try:
                del pointer[keys[pos]]
            except KeyError:
                pass
            try:
                keys.pop()
            except IndexError:
                pass


class KeyBuffer:  # pylint: disable=too-many-instance-attributes
    """Incremental parser walking a KeyMaps trie as keys are pressed."""

    any_key = ANYKEY
    passive_key = PASSIVE_ACTION
    quantifier_key = QUANT_KEY
    excluded_from_anykey = [curses.ascii.ESC]

    def __init__(self, keymap=None):
        self.keymap = keymap
        self.keys = []
        self.wildcards = []
        self.pointer = self.keymap
        self.result = None
        self.quantifier = None
        self.finished_parsing_quantifier = False
        self.finished_parsing = False
        self.parse_error = False

        if (
            self.keymap
            and self.quantifier_key in self.keymap
            and self.keymap[self.quantifier_key] == 'false'
        ):
            self.finished_parsing_quantifier = True

    def clear(self):
        self.__init__(self.keymap)  # pylint: disable=unnecessary-dunder-call

    def add(self, key):
        self.keys.append(key)
        self.result = None
        # Leading digits form a numeric quantifier unless disabled.
        if not self.finished_parsing_quantifier and key in DIGITS:
            if self.quantifier is None:
                self.quantifier = 0
            self.quantifier = self.quantifier * 10 + key - 48  # (48 = ord('0'))
        else:
            self.finished_parsing_quantifier = True

            moved = True
            if key in self.pointer:
                self.pointer = self.pointer[key]
            elif self.any_key in self.pointer and key not in self.excluded_from_anykey:
                self.wildcards.append(key)
                self.pointer = self.pointer[self.any_key]
            else:
                moved = False

            if moved:
                if isinstance(self.pointer, dict):
                    if self.passive_key in self.pointer:
                        self.result = self.pointer[self.passive_key]
                else:
                    # Reached a leaf: a complete binding.
                    self.result = self.pointer
                    self.finished_parsing = True
            else:
                self.finished_parsing = True
                self.parse_error = True

    def __str__(self):
        return construct_keybinding(self.keys)
nvitop-1.4.2/nvitop/tui/library/libcurses.py000066400000000000000000000211471474547113600212070ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
# License: GNU GPL version 3.
# pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring

import colorsys
import contextlib
import curses
import locale
import os
import signal

from nvitop.tui.library.history import GRAPH_SYMBOLS


LIGHT_THEME = False
DEFAULT_FOREGROUND = curses.COLOR_WHITE
DEFAULT_BACKGROUND = curses.COLOR_BLACK
COLOR_PAIRS = {None: 0}

# Registry of custom RGB colors: name/rgb-tuple -> curses color number.
# The first 16 slots are the standard palette; 16-63 are reserved.
TRUE_COLORS = dict(
    [
        ('black', 0),
        ('red', 1),
        ('green', 2),
        ('yellow', 3),
        ('blue', 4),
        ('magenta', 5),
        ('cyan', 6),
        ('white', 7),
        ('bright black', 8),
        ('bright red', 9),
        ('bright green', 10),
        ('bright yellow', 11),
        ('bright blue', 12),
        ('bright magenta', 13),
        ('bright cyan', 14),
        ('bright white', 15),
    ]
    + [(f'preserved {i:02d}', i) for i in range(16, 64)],
)

BASE_ATTR = 0


def _init_color_theme(light_theme=False):
    """Set the default fg/bg colors."""
    global LIGHT_THEME, DEFAULT_FOREGROUND, DEFAULT_BACKGROUND  # pylint: disable=global-statement

    LIGHT_THEME = light_theme
    if LIGHT_THEME:
        DEFAULT_FOREGROUND = curses.COLOR_BLACK
        DEFAULT_BACKGROUND = curses.COLOR_WHITE
    else:
        DEFAULT_FOREGROUND = curses.COLOR_WHITE
        DEFAULT_BACKGROUND = curses.COLOR_BLACK


def _colormap(x, levels=160):  # pylint: disable=invalid-name
    # Map a float in [0, 1] to an RGB triple (0-1000 scale, as curses
    # expects) along a quantized HSV hue sweep.
    h = 0.5 * (1.0 - x) - 0.15
    h = (round(h * levels) / levels) % 1.0
    r, g, b = colorsys.hsv_to_rgb(h, 0.7, 0.8)
    return (round(1000.0 * r), round(1000.0 * g), round(1000.0 * b))


def _get_true_color(rgb):
    # Allocate (or look up) a custom curses color number for an RGB triple;
    # returns -1 if the terminal cannot redefine colors.
    if rgb not in TRUE_COLORS:
        try:
            curses.init_color(len(TRUE_COLORS), *rgb)
        except curses.error:
            return -1
        TRUE_COLORS[rgb] = len(TRUE_COLORS)
    return TRUE_COLORS[rgb]


def _get_color(fg, bg):
    """Return the curses color pair for the given fg/bg combination."""
    global COLOR_PAIRS  # pylint: disable=global-statement,global-variable-not-assigned

    # Each of fg/bg may be a name ('red'), an RGB tuple, or a float fed
    # through the colormap; all are normalized to curses color numbers.
    if isinstance(fg, str):
        fg = getattr(curses, f'COLOR_{fg.upper()}', -1)
    elif isinstance(fg, tuple):
        fg = _get_true_color(fg)
    elif isinstance(fg, float):
        fg = _get_true_color(_colormap(fg))
    if isinstance(bg, str):
        bg = getattr(curses, f'COLOR_{bg.upper()}', -1)
    elif isinstance(bg, tuple):
        bg = _get_true_color(bg)
    elif isinstance(bg, float):
        bg = _get_true_color(_colormap(bg))

    key = (fg, bg)
    if key not in COLOR_PAIRS:
        size = len(COLOR_PAIRS)
        try:
            curses.init_pair(size, fg, bg)
        except curses.error:
            # If curses.use_default_colors() failed during the initialization
            # of curses, then using -1 as fg or bg will fail as well, which
            # we need to handle with fallback-defaults:
            if fg == -1:  # -1 is the "default" color
                fg = DEFAULT_FOREGROUND
            if bg == -1:  # -1 is the "default" color
                bg = DEFAULT_BACKGROUND

            try:
                curses.init_pair(size, fg, bg)
            except curses.error:
                # If this fails too, colors are probably not supported
                pass
        COLOR_PAIRS[key] = size

    return COLOR_PAIRS[key]


def setlocale_utf8():
    # Try to select a UTF-8 locale so wide/braille characters render;
    # returns True on success.
    for code in ('C.UTF-8', 'en_US.UTF-8', '', 'C'):
        try:
            code = locale.setlocale(locale.LC_ALL, code)
        except locale.Error:  # noqa: PERF203
            continue
        else:
            if 'utf8' in code.lower() or 'utf-8' in code.lower():
                return True

    return False


@contextlib.contextmanager
def libcurses(colorful=False, light_theme=False):
    """Context manager that initializes curses (keypad, mouse, colors,
    SIGINT handling) and guarantees ``curses.endwin()`` on exit."""
    os.environ.setdefault('ESCDELAY', '25')
    setlocale_utf8()

    win = curses.initscr()
    win.nodelay(True)
    win.leaveok(True)
    win.keypad(True)

    curses.noecho()
    curses.cbreak()
    curses.curs_set(False)
    curses.mousemask(curses.ALL_MOUSE_EVENTS | curses.REPORT_MOUSE_POSITION)
    curses.mouseinterval(0)
    curses.ungetmouse(0, 0, 0, 0, 0)

    _init_color_theme(light_theme)

    curses.start_color()
    try:
        curses.use_default_colors()
    except curses.error:
        pass

    if colorful:
        try:
            CursesShortcuts.TERM_256COLOR = curses.COLORS >= 256
        except AttributeError:
            pass

    # Push a Ctrl+C (ascii value 3) to the curses getch stack
    def interrupt_handler(signalnum, frame):  # pylint: disable=unused-argument
        curses.ungetch(3)

    # Simulate a ^C press in curses when an interrupt is caught
    signal.signal(signal.SIGINT, interrupt_handler)

    try:
        yield win
    finally:
        curses.endwin()


class CursesShortcuts:
    """This class defines shortcuts to facilitate operations with curses.

    color(*keys) -- sets the color associated with the keys from the
        current colorscheme.
    color_at(y, x, width, *keys) -- sets the color at the given position
    color_reset() -- resets the color to the default
    addstr(*args) -- failsafe version of self.win.addstr(*args)
    """

    # Translation table used in ASCII-only mode to replace box-drawing and
    # braille characters with plain ASCII equivalents.
    ASCII_TRANSTABLE = str.maketrans(
        '═─╴╒╤╕╪╘╧╛┌┬┐┼└┴┘│╞╡├┤▏▎▍▌▋▊▉█░▲▼␤' + GRAPH_SYMBOLS,
        '=--++++++++++++++||||||||||||||^v?' + '=' * len(GRAPH_SYMBOLS),
    )

    TERM_256COLOR = False

    def __init__(self):
        self.win = None  # type: curses._CursesWindow
        self.ascii = False

    def addstr(self, *args, **kwargs):
        if self.ascii:
            args = [
                arg.translate(self.ASCII_TRANSTABLE) if isinstance(arg, str) else arg
                for arg in args
            ]
        try:
            self.win.addstr(*args, **kwargs)
        except curses.error:
            # Writing to the bottom-right cell raises; ignore it.
            pass

    def addnstr(self, *args, **kwargs):
        if self.ascii:
            args = [
                arg.translate(self.ASCII_TRANSTABLE) if isinstance(arg, str) else arg
                for arg in args
            ]
        try:
            self.win.addnstr(*args, **kwargs)
        except curses.error:
            pass

    def addch(self, *args, **kwargs):
        if self.ascii:
            args = [
                arg.translate(self.ASCII_TRANSTABLE) if isinstance(arg, str) else arg
                for arg in args
            ]
        try:
            self.win.addch(*args, **kwargs)
        except curses.error:
            pass

    def color(self, fg=-1, bg=-1, attr=0):
        """Change the colors from now on."""
        return self.set_fg_bg_attr(fg, bg, attr)

    def color_reset(self):
        """Change the colors to the default colors."""
        return self.color()

    def color_at(self, y, x, width, *args, **kwargs):
        """Change the colors at the specified position."""
        try:
            self.win.chgat(y, x, width, self.get_fg_bg_attr(*args, **kwargs))
        except curses.error:
            pass

    @staticmethod
    def get_fg_bg_attr(fg=-1, bg=-1, attr=0):
        """Return the curses attribute for the given fg/bg/attr combination."""
        if fg == -1 and bg == -1 and attr == 0:
            return BASE_ATTR

        # `attr` may be a string like 'bold | underline'.
        if isinstance(attr, str):
            attr_strings = map(str.strip, attr.split('|'))
            attr = 0
            for s in attr_strings:
                attr |= getattr(curses, f'A_{s.upper()}', 0)

        # Tweak for light themes
        if (
            LIGHT_THEME
            and attr & curses.A_REVERSE != 0
            and bg == -1
            and fg not in {DEFAULT_FOREGROUND, -1}
        ):
            bg = DEFAULT_FOREGROUND

        if fg == -1 and bg == -1:
            return attr | BASE_ATTR
        return curses.color_pair(_get_color(fg, bg)) | attr | BASE_ATTR

    def set_fg_bg_attr(self, fg=-1, bg=-1, attr=0):
        try:
            attr = self.get_fg_bg_attr(fg, bg, attr)
            self.win.attrset(attr)
        except curses.error:
            return 0
        return attr

    def update_size(self, termsize=None):
        if termsize is not None:
            return termsize

        self.update_lines_cols()
        return self.win.getmaxyx()

    @staticmethod
    def update_lines_cols():
        curses.update_lines_cols()

    @staticmethod
    def beep():
        curses.beep()

    @staticmethod
    def flash():
        curses.flash()

    @staticmethod
    def set_base_attr(attr=0):
        global BASE_ATTR  # pylint: disable=global-statement

        if isinstance(attr, str):
            attr_strings = map(str.strip, attr.split('|'))
            attr = 0
            for s in attr_strings:
                attr |= getattr(curses, f'A_{s.upper()}', 0)

        BASE_ATTR = attr
nvitop-1.4.2/nvitop/tui/library/messagebox.py000066400000000000000000000306721474547113600213530ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
# This file is originally part of ranger, the console file manager. https://github.com/ranger/ranger
# License: GNU GPL version 3.
# pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring

import curses
import string
import threading
import time
from functools import partial

from nvitop.tui.library.displayable import Displayable
from nvitop.tui.library.keybinding import normalize_keybinding
from nvitop.tui.library.process import host
from nvitop.tui.library.utils import cut_string
from nvitop.tui.library.widestring import WideString


DIGITS = set(string.digits)


class MessageBox(Displayable):  # pylint: disable=too-many-instance-attributes
    """Modal confirmation dialog drawn over the TUI.

    Presents a message and a row of option buttons; options can be chosen
    with bound keys, arrow keys, digits, or the mouse.

    NOTE(review): the `<...>` key-name string literals in this file were
    stripped to empty strings by the archive extraction (visible as `''`
    in the recovered text); they have been reconstructed below from the
    surrounding structure — confirm against the upstream source.
    """

    class Option:  # pylint: disable=too-few-public-methods
        """One button of the messagebox: a label, a trigger key (plus
        optional aliases), a callback, and extra color attributes."""

        # pylint: disable-next=too-many-arguments
        def __init__(self, name, key, callback, *, keys=(), attrs=()):
            self.name = WideString(name)
            self.offset = 0
            self.key = normalize_keybinding(key)
            self.callback = callback
            # Alias keys, deduplicated and excluding the primary key.
            self.keys = tuple({normalize_keybinding(key) for key in keys}.difference({self.key}))
            self.attrs = attrs

        def __str__(self):
            return str(self.name)

    # pylint: disable-next=too-many-arguments
    def __init__(self, message, options, *, default, yes, no, cancel, win, root):
        super().__init__(win, root)

        if default is None:
            default = 0
        if no is None:
            no = cancel

        self.options = options
        self.num_options = len(self.options)

        assert cancel is not None
        assert self.num_options >= 2
        assert 0 <= no < self.num_options
        assert 0 <= cancel < self.num_options
        assert 0 <= default < self.num_options

        self.previous_focused = None
        self.message = message
        self.previous_keymap = root.keymaps.used_keymap
        self.current = default
        self.yes = yes
        self.cancel = cancel
        self.no = no  # pylint: disable=invalid-name
        self.timestamp = time.monotonic()

        # All buttons share the widest label's width (min 8), centered.
        self.name_len = max(8, *(len(option.name) for option in options))
        for option in self.options:
            option.offset = (self.name_len - len(option.name)) // 2
            option.name = option.name.center(self.name_len)

        self.xy_mouse = None
        self.x, self.y = root.x, root.y
        self.width = (self.name_len + 6) * self.num_options + 6

        self.init_keybindings()

        # Greedily word-wrap the message to the inner dialog width.
        lines = []
        for msg in self.message.splitlines():
            words = iter(map(WideString, msg.split()))
            try:
                lines.append(next(words))
            except StopIteration:
                lines.append('')
                continue
            for word in words:
                if len(lines[-1]) + len(word) + 1 <= self.width - 6:
                    lines[-1] += ' ' + word
                else:
                    lines[-1] = lines[-1].strip()
                    lines.append(word)
        if len(lines) == 1:
            lines[-1] = WideString(lines[-1]).center(self.width - 6)

        # Assemble the dialog frame with the message and the button row.
        lines = [f' │ {line.ljust(self.width - 6)} │ ' for line in lines]
        lines = [
            ' ╒' + '═' * (self.width - 4) + '╕ ',
            ' │' + ' ' * (self.width - 4) + '│ ',
            *lines,
            ' │' + ' ' * (self.width - 4) + '│ ',
            ' │ ' + ' '.join(['┌' + '─' * (self.name_len + 2) + '┐'] * self.num_options) + ' │ ',
            ' │ ' + ' '.join(map('│ {} │'.format, self.options)) + ' │ ',
            ' │ ' + ' '.join(['└' + '─' * (self.name_len + 2) + '┘'] * self.num_options) + ' │ ',
            ' ╘' + '═' * (self.width - 4) + '╛ ',
        ]
        self.lines = lines

    @property
    def current(self):
        return self._current

    @current.setter
    def current(self, value):
        self._current = value
        self.timestamp = time.monotonic()

    def draw(self):
        self.set_base_attr(attr=0)
        self.color_reset()

        n_term_lines, n_term_cols = self.root.termsize

        height = len(self.lines)
        y_start, x_start = (n_term_lines - height) // 2, (n_term_cols - self.width) // 2
        y_option_start = y_start + height - 3

        for y, line in enumerate(self.lines, start=y_start):
            self.addstr(y, x_start, line)

        for i, option in enumerate(self.options):
            x_option_start = x_start + 6 + i * (self.name_len + 6) + option.offset
            for attr in option.attrs:
                attr = attr.copy()
                y = y_option_start + attr.pop('y')
                x = x_option_start + attr.pop('x')
                self.color_at(y, x, **attr)

        # A pending mouse click selects (and confirms) the button under it.
        if self.xy_mouse is not None:
            x, y = self.xy_mouse
            if y_option_start - 1 <= y <= y_option_start + 1:
                current = (x - x_start - 3) // (self.name_len + 6)
                x_option_start = x_start + 6 + current * (self.name_len + 6)
                if (
                    0 <= current < self.num_options
                    and x_option_start - 3 <= x < x_option_start + self.name_len + 3
                ):
                    self.apply(current, wait=True)

        # Highlight the currently selected button (inverted colors).
        option = self.options[self.current]
        x_option_start = x_start + 6 + self.current * (self.name_len + 6)
        for y in range(y_option_start - 1, y_option_start + 2):
            self.color_at(
                y,
                x_option_start - 3,
                width=self.name_len + 6,
                attr='standout | bold',
            )
        for attr in option.attrs:
            attr = attr.copy()
            y = y_option_start + attr.pop('y')
            x = x_option_start + option.offset + attr.pop('x')
            attr['fg'], attr['bg'] = attr.get('bg', -1), attr.get('fg', -1)
            attr['attr'] = self.get_fg_bg_attr(attr=attr.get('attr', 0))
            attr['attr'] |= self.get_fg_bg_attr(attr='standout | bold')
            self.color_at(y, x, **attr)

    def finalize(self):
        self.xy_mouse = None
        super().finalize()

    def press(self, key):
        self.root.keymaps.use_keymap('messagebox')
        self.root.press(key)

    def click(self, event):
        if event.pressed(1) or event.pressed(3) or event.clicked(1) or event.clicked(3):
            self.xy_mouse = (event.x, event.y)
            return True

        # Mouse wheel cycles through the options.
        direction = event.wheel_direction()
        self.current = (self.current + direction) % self.num_options
        return True

    def apply(self, index=None, wait=None):
        """Select option `index`; on confirmation run its callback and
        restore the previous keymap/screen state."""
        if index is None:
            index = self.current

        assert 0 <= index < self.num_options

        if (index != self.current and wait is None) or wait:
            # First highlight the chosen option, then auto-confirm shortly
            # afterwards by replaying an Enter key press.
            self.current = index

            def confirm():
                time.sleep(0.25)
                curses.ungetch(curses.KEY_ENTER)

            threading.Thread(name='messagebox-confirm', target=confirm, daemon=True).start()
            return

        callback = self.options[index].callback
        if callback is not None:
            callback()

        self.root.keymaps.clear_keymap('messagebox')
        self.root.keymaps.use_keymap(self.previous_keymap)
        self.root.need_redraw = True
        self.root.messagebox = None

    def init_keybindings(self):  # pylint: disable=too-many-branches
        def select_previous():
            self.current = (self.current - 1) % self.num_options

        def select_next():
            self.current = (self.current + 1) % self.num_options

        keymaps = self.root.keymaps
        keymaps.clear_keymap('messagebox')

        for i, option in enumerate(self.options):
            keymaps.bind('messagebox', option.key, partial(self.apply, index=i))
            for key in option.keys:
                keymaps.copy('messagebox', option.key, key)

        keymaps['messagebox'][keymaps.keybuffer.quantifier_key] = 'false'
        if len(DIGITS.intersection(keymaps['messagebox'])) == 0 and self.num_options <= 9:
            # Digits 1..9 select the corresponding option.
            for key_n, option in zip('123456789', self.options):
                keymaps.copy('messagebox', option.key, key_n)

        # Reconstructed literals (stripped in the recovered text) — the
        # navigation/confirmation keys must not collide with option keys.
        assert (
            len({'<Esc>', '<Enter>', '<Left>', '<Right>'}.intersection(keymaps['messagebox'])) == 0
        )

        if self.yes is not None and 'y' not in keymaps['messagebox']:
            keymaps.copy('messagebox', self.options[self.yes].key, 'y')
            if 'Y' not in keymaps['messagebox']:
                keymaps.copy('messagebox', self.options[self.yes].key, 'Y')
        if self.no is not None and 'n' not in keymaps['messagebox']:
            keymaps.copy('messagebox', self.options[self.no].key, 'n')
            if 'N' not in keymaps['messagebox']:
                keymaps.copy('messagebox', self.options[self.no].key, 'N')

        if self.cancel is not None:
            keymaps.bind('messagebox', '<Esc>', partial(self.apply, index=self.cancel, wait=False))
            if 'q' not in keymaps['messagebox'] and 'Q' not in keymaps['messagebox']:
                keymaps.copy('messagebox', '<Esc>', 'q')
                keymaps.copy('messagebox', '<Esc>', 'Q')

        keymaps.bind('messagebox', '<Enter>', self.apply)
        if '<Space>' not in keymaps['messagebox']:
            keymaps.copy('messagebox', '<Enter>', '<Space>')

        keymaps.bind('messagebox', '<Left>', select_previous)
        keymaps.bind('messagebox', '<Right>', select_next)
        if ',' not in keymaps['messagebox'] and '.' not in keymaps['messagebox']:
            keymaps.copy('messagebox', '<Left>', ',')
            keymaps.copy('messagebox', '<Right>', '.')
        if '<' not in keymaps['messagebox'] and '>' not in keymaps['messagebox']:
            keymaps.copy('messagebox', '<Left>', '<')
            keymaps.copy('messagebox', '<Right>', '>')
        if '[' not in keymaps['messagebox'] and ']' not in keymaps['messagebox']:
            keymaps.copy('messagebox', '<Left>', '[')
            keymaps.copy('messagebox', '<Right>', ']')
        if '<S-Tab>' not in keymaps['messagebox'] and '<Tab>' not in keymaps['messagebox']:
            keymaps.copy('messagebox', '<Left>', '<S-Tab>')
            keymaps.copy('messagebox', '<Right>', '<Tab>')


def send_signal(signal, panel):
    assert signal in {'terminate', 'kill', 'interrupt'}
    default = {'terminate': 0, 'kill': 1, 'interrupt': 2}.get(signal)

    processes = []
    for process in panel.selection.processes():
        try:
            username = process.username()
        except host.PsutilError:
            username = 'N/A'
        username = cut_string(username, maxlen=24, padstr='+')
        processes.append(f'{process.pid}({username})')

    if len(processes) == 0:
        return
    if len(processes) == 1:
        message = f'Send signal to process {processes[0]}?'
else: maxlen = max(map(len, processes)) processes = [process.ljust(maxlen) for process in processes] message = 'Send signal to the following processes?\n\n{}'.format(' '.join(processes)) # pylint: disable=use-dict-literal panel.root.messagebox = MessageBox( message=message, options=[ MessageBox.Option( 'SIGTERM', 't', panel.selection.terminate, keys=('T',), attrs=( {'y': 0, 'x': 0, 'width': 7, 'fg': 'red'}, {'y': 0, 'x': 3, 'width': 1, 'fg': 'red', 'attr': 'bold | underline'}, ), ), MessageBox.Option( 'SIGKILL', 'k', panel.selection.kill, keys=('K',), attrs=( {'y': 0, 'x': 0, 'width': 7, 'fg': 'red'}, {'y': 0, 'x': 3, 'width': 1, 'fg': 'red', 'attr': 'bold | underline'}, ), ), MessageBox.Option( 'SIGINT', 'i', panel.selection.interrupt, keys=('I',), attrs=( {'y': 0, 'x': 0, 'width': 6, 'fg': 'red'}, {'y': 0, 'x': 3, 'width': 1, 'fg': 'red', 'attr': 'bold | underline'}, ), ), MessageBox.Option( 'Cancel', 'c', None, keys=('C',), attrs=({'y': 0, 'x': 0, 'width': 1, 'attr': 'bold | underline'},), ), ], default=default, yes=None, no=3, cancel=3, win=panel.win, root=panel.root, ) # pylint: enable=use-dict-literal nvitop-1.4.2/nvitop/tui/library/mouse.py000066400000000000000000000064031474547113600203420ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # This file is originally part of ranger, the console file manager. https://github.com/ranger/ranger # License: GNU GPL version 3. 
# pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring import curses class MouseEvent: PRESSED = [ 0, curses.BUTTON1_PRESSED, curses.BUTTON2_PRESSED, curses.BUTTON3_PRESSED, curses.BUTTON4_PRESSED, ] RELEASED = [ 0, curses.BUTTON1_RELEASED, curses.BUTTON2_RELEASED, curses.BUTTON3_RELEASED, curses.BUTTON4_RELEASED, ] CLICKED = [ 0, curses.BUTTON1_CLICKED, curses.BUTTON2_CLICKED, curses.BUTTON3_CLICKED, curses.BUTTON4_CLICKED, ] DOUBLE_CLICKED = [ 0, curses.BUTTON1_DOUBLE_CLICKED, curses.BUTTON2_DOUBLE_CLICKED, curses.BUTTON3_DOUBLE_CLICKED, curses.BUTTON4_DOUBLE_CLICKED, ] CTRL_SCROLLWHEEL_MULTIPLIER = 5 def __init__(self, state): """Create a MouseEvent object from the result of win.getmouse().""" _, self.x, self.y, _, self.bstate = state # x-values above ~220 suddenly became negative, apparently # it's sufficient to add 0xFF to fix that error. if self.x < 0: self.x += 0xFF if self.y < 0: self.y += 0xFF def pressed(self, n): """Return whether the mouse key n is pressed.""" try: return (self.bstate & MouseEvent.PRESSED[n]) != 0 except IndexError: return False def released(self, n): """Return whether the mouse key n is released.""" try: return (self.bstate & MouseEvent.RELEASED[n]) != 0 except IndexError: return False def clicked(self, n): """Return whether the mouse key n is clicked.""" try: return (self.bstate & MouseEvent.CLICKED[n]) != 0 except IndexError: return False def double_clicked(self, n): """Return whether the mouse key n is double clicked.""" try: return (self.bstate & MouseEvent.DOUBLE_CLICKED[n]) != 0 except IndexError: return False def wheel_direction(self): """Return the direction of the scroll action, 0 if there was none.""" # If the bstate > ALL_MOUSE_EVENTS, it's an invalid mouse button. # I interpret invalid buttons as "scroll down" because all tested # systems have a broken curses implementation and this is a workaround. 
# Recently it seems to have been fixed, as 2**21 was introduced as # the code for the "scroll down" button. if self.pressed(4): return -self.CTRL_SCROLLWHEEL_MULTIPLIER if self.ctrl() else -1 if self.pressed(2) or (self.bstate & (1 << 21)) or self.key_invalid(): return self.CTRL_SCROLLWHEEL_MULTIPLIER if self.ctrl() else 1 return 0 def ctrl(self): return self.bstate & curses.BUTTON_CTRL def alt(self): return self.bstate & curses.BUTTON_ALT def shift(self): return self.bstate & curses.BUTTON_SHIFT def key_invalid(self): return self.bstate > curses.ALL_MOUSE_EVENTS nvitop-1.4.2/nvitop/tui/library/process.py000066400000000000000000000072061474547113600206720ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. # pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring from nvitop.api import ( NA, GiB, HostProcess, Snapshot, bytes2human, host, timedelta2human, utilization2string, ) from nvitop.api import GpuProcess as GpuProcessBase __all__ = [ 'host', 'HostProcess', 'GpuProcess', 'NA', 'Snapshot', 'bytes2human', 'GiB', 'timedelta2human', ] class GpuProcess(GpuProcessBase): def __new__(cls, *args, **kwargs): instance = super().__new__(cls, *args, **kwargs) instance._snapshot = None return instance @property def snapshot(self) -> Snapshot: if self._snapshot is None: self.as_snapshot() return self._snapshot def host_snapshot(self) -> Snapshot: host_snapshot = super().host_snapshot() if host_snapshot.cpu_percent is NA: host_snapshot.cpu_percent_string = NA elif host_snapshot.cpu_percent < 1000.0: host_snapshot.cpu_percent_string = f'{host_snapshot.cpu_percent:.1f}%' elif host_snapshot.cpu_percent < 10000: host_snapshot.cpu_percent_string = f'{int(host_snapshot.cpu_percent)}%' else: host_snapshot.cpu_percent_string = '9999+%' if host_snapshot.memory_percent is NA: host_snapshot.memory_percent_string = NA else: host_snapshot.memory_percent_string = 
f'{host_snapshot.memory_percent:.1f}%' return host_snapshot def as_snapshot(self, *, host_process_snapshot_cache=None) -> Snapshot: snapshot = super().as_snapshot(host_process_snapshot_cache=host_process_snapshot_cache) snapshot.type = snapshot.type.replace('C+G', 'X') if snapshot.gpu_memory_human is NA and (host.WINDOWS or host.WSL): snapshot.gpu_memory_human = 'WDDM:N/A' snapshot.cpu_percent_string = snapshot.host.cpu_percent_string snapshot.memory_percent_string = snapshot.host.memory_percent_string if snapshot.is_running: snapshot.is_zombie = snapshot.cmdline == ['Zombie Process'] snapshot.no_permissions = snapshot.cmdline == ['No Permissions'] snapshot.is_gone = False else: snapshot.is_zombie = False snapshot.no_permissions = False snapshot.is_gone = snapshot.cmdline == ['No Such Process'] snapshot.gpu_memory_percent_string = self.gpu_memory_percent_string() snapshot.gpu_sm_utilization_string = self.gpu_sm_utilization_string() snapshot.gpu_memory_utilization_string = self.gpu_memory_utilization_string() snapshot.gpu_encoder_utilization_string = self.gpu_encoder_utilization_string() snapshot.gpu_decoder_utilization_string = self.gpu_decoder_utilization_string() self._snapshot = snapshot # pylint: disable=attribute-defined-outside-init return snapshot def gpu_memory_percent_string(self) -> str: # in percentage return utilization2string(self.gpu_memory_percent()) def gpu_sm_utilization_string(self) -> str: # in percentage return utilization2string(self.gpu_sm_utilization()) def gpu_memory_utilization_string(self) -> str: # in percentage return utilization2string(self.gpu_memory_utilization()) def gpu_encoder_utilization_string(self) -> str: # in percentage return utilization2string(self.gpu_encoder_utilization()) def gpu_decoder_utilization_string(self) -> str: # in percentage return utilization2string(self.gpu_decoder_utilization()) nvitop-1.4.2/nvitop/tui/library/selection.py000066400000000000000000000112271474547113600211770ustar00rootroot00000000000000# This 
file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. # pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring import signal import time from weakref import WeakValueDictionary from nvitop.api import NA, Snapshot, host from nvitop.tui.library.utils import LARGE_INTEGER, SUPERUSER, USERNAME class Selection: # pylint: disable=too-many-instance-attributes def __init__(self, panel): self.tagged = WeakValueDictionary() self.panel = panel self.index = None self.within_window = True self._process = None self._username = None self._ident = None @property def identity(self): if self._ident is None: self._ident = self.process._ident # pylint: disable=protected-access return self._ident @property def process(self): return self._process @process.setter def process(self, process): if isinstance(process, Snapshot): process = process.real self._process = process self._ident = None @property def pid(self): try: return self.identity[0] except TypeError: return None @property def username(self): if self._username is None: try: self._username = self.process.username() except host.PsutilError: self._username = NA return self._username def move(self, direction=0): if direction == 0: return processes = self.panel.snapshots old_index = self.index if len(processes) > 0: if not self.is_set(): if abs(direction) < LARGE_INTEGER: self.index = 0 if direction > 0 else len(processes) - 1 else: self.index = len(processes) - 1 if direction > 0 else 0 else: self.index = min(max(0, self.index + direction), len(processes) - 1) self.process = processes[self.index] if old_index is not None: direction -= self.index - old_index else: direction = 0 if direction != 0 and self.panel.NAME == 'process': self.panel.parent.move(direction) else: self.clear() def owned(self): if not self.is_set(): return False if SUPERUSER: return True return self.username == USERNAME def tag(self): if self.is_set(): try: del self.tagged[self.pid] 
except KeyError: self.tagged[self.pid] = self.process def processes(self): if len(self.tagged) > 0: return tuple(sorted(self.tagged.values(), key=lambda p: p.pid)) if self.owned() and self.within_window: return (self.process,) return () def foreach(self, func): flag = False for process in self.processes(): try: func(process) except host.PsutilError: # noqa: PERF203 pass else: flag = True if flag: time.sleep(0.25) self.clear() def send_signal(self, sig): self.foreach(lambda process: process.send_signal(sig)) def interrupt(self): try: self.send_signal( ( signal.SIGINT if not host.WINDOWS else signal.CTRL_C_EVENT # pylint: disable=no-member ), ) except SystemError: pass def terminate(self): self.foreach(lambda process: process.terminate()) def kill(self): self.foreach(lambda process: process.kill()) def reset(self): self.index = None self.within_window = True self._process = None self._username = None self._ident = None def clear(self): self.tagged.clear() self.reset() def is_set(self): return self.process is not None __bool__ = is_set def is_same(self, process): try: return self.identity == process._ident # pylint: disable=protected-access except (AttributeError, TypeError): pass return False __eq__ = is_same def is_same_on_host(self, process): try: return self.identity[:2] == process._ident[:2] # pylint: disable=protected-access except (AttributeError, TypeError): pass return False def is_tagged(self, process): return process.pid in self.tagged nvitop-1.4.2/nvitop/tui/library/utils.py000066400000000000000000000051531474547113600203530ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. 
# pylint: disable=missing-module-docstring,missing-function-docstring import contextlib import math import os from nvitop.api import NA, colored, host, set_color, ttl_cache from nvitop.tui.library.widestring import WideString __all__ = [ 'NA', 'USERNAME', 'HOSTNAME', 'SUPERUSER', 'USERCONTEXT', 'LARGE_INTEGER', 'ttl_cache', 'colored', 'set_color', 'cut_string', 'make_bar', ] USERNAME = 'N/A' with contextlib.suppress(ImportError, OSError): USERNAME = host.getuser() SUPERUSER = False with contextlib.suppress(AttributeError, OSError): if host.WINDOWS: import ctypes SUPERUSER = bool(ctypes.windll.shell32.IsUserAnAdmin()) else: try: SUPERUSER = os.geteuid() == 0 except AttributeError: SUPERUSER = os.getuid() == 0 HOSTNAME = host.hostname() if host.WSL: HOSTNAME = f'{HOSTNAME} (WSL)' USERCONTEXT = f'{USERNAME}@{HOSTNAME}' LARGE_INTEGER = 65536 def cut_string(s, maxlen, padstr='...', align='left'): assert align in {'left', 'right'} if not isinstance(s, str): s = str(s) s = WideString(s) padstr = WideString(padstr) if len(s) <= maxlen: return str(s) if len(padstr) >= maxlen: return str(padstr[:maxlen]) if align == 'left': return str(s[: maxlen - len(padstr)] + padstr) return str(padstr + s[-(maxlen - len(padstr)) :]) # pylint: disable=disallowed-name def make_bar(prefix, percent, width, *, extra_text=''): bar = f'{prefix}: ' if percent != NA and not (isinstance(percent, float) and not math.isfinite(percent)): if isinstance(percent, str) and percent.endswith('%'): percent = percent.replace('%', '') percent = float(percent) if '.' 
in percent else int(percent) percentage = max(0.0, min(float(percent) / 100.0, 1.0)) quotient, remainder = divmod(max(1, round(8 * (width - len(bar) - 4) * percentage)), 8) bar += '█' * quotient if remainder > 0: bar += ' ▏▎▍▌▋▊▉'[remainder] if isinstance(percent, float) and len(f'{bar} {percent:.1f}%') <= width: text = f'{percent:.1f}%' else: text = f'{min(round(percent), 100):d}%'.replace('100%', 'MAX') else: bar += '░' * (width - len(bar) - 4) text = 'N/A' if extra_text and len(f'{bar} {text} {extra_text}') <= width: return f'{bar} {text}'.ljust(width - len(extra_text) - 1) + f' {extra_text}' return f'{bar} {text}'.ljust(width) nvitop-1.4.2/nvitop/tui/library/widestring.py000066400000000000000000000160071474547113600213720ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # This file is originally part of ranger, the console file manager. https://github.com/ranger/ranger # License: GNU GPL version 3. # pylint: disable=missing-module-docstring,missing-class-docstring from unicodedata import east_asian_width ASCIIONLY = set(map(chr, range(1, 128))) NARROW = 1 WIDE = 2 WIDE_SYMBOLS = set('WF') def utf_char_width(string): """Return the width of a single character.""" if east_asian_width(string) in WIDE_SYMBOLS: return WIDE return NARROW def string_to_charlist(string): """Return a list of characters with extra empty strings after wide chars.""" if ASCIIONLY.issuperset(string): return list(string) result = [] for char in string: result.append(char) if east_asian_width(char) in WIDE_SYMBOLS: result.append('') return result def wcslen(string): """Return the length of a string with wide chars.""" return len(WideString(string)) class WideString: # pylint: disable=too-few-public-methods,wrong-spelling-in-docstring def __init__(self, string='', chars=None): if isinstance(string, WideString): string = string.string try: self.string = str(string) except UnicodeEncodeError: self.string = string.encode('latin-1', 'ignore') if 
chars is None: self.chars = string_to_charlist(string) else: self.chars = chars def __add__(self, other): """ >>> (WideString('a') + WideString('b')).string 'ab' >>> (WideString('a') + WideString('b')).chars ['a', 'b'] >>> (WideString('afd') + 'bc').chars ['a', 'f', 'd', 'b', 'c'] """ if isinstance(other, str): return WideString(self.string + other) if isinstance(other, WideString): return WideString(self.string + other.string, self.chars + other.chars) return NotImplemented def __radd__(self, other): """ >>> ('bc' + WideString('afd')).chars ['b', 'c', 'a', 'f', 'd'] """ if isinstance(other, str): return WideString(other + self.string) if isinstance(other, WideString): return WideString(other.string + self.string, other.chars + self.chars) return NotImplemented def __iadd__(self, other): new = self + other self.string = new.string self.chars = new.chars return self def __str__(self): return self.string def __repr__(self): return f'<{self.__class__.__name__} {self.string!r}>' def __eq__(self, other): if not isinstance(other, (str, WideString)): raise TypeError return str(self) == str(other) def __hash__(self): return hash(self.string) def __getitem__(self, item): """ >>> WideString('asdf')[2] >>> WideString('……')[0] >>> WideString('……')[1] >>> WideString('asdf')[1:3] >>> WideString('asdf')[1:-100] >>> WideString('モヒカン')[2:4] >>> WideString('モヒカン')[2:5] >>> WideString('モabカン')[2:5] >>> WideString('モヒカン')[1:5] >>> WideString('モヒカン')[:] >>> WideString('aモ')[0:3] >>> WideString('aモ')[0:2] >>> WideString('aモ')[0:1] """ if isinstance(item, slice): assert item.step is None or item.step == 1 start, stop = item.start, item.stop else: assert isinstance(item, int) start, stop = item, item + 1 length = len(self) if stop is None or stop > length: stop = length if stop < 0: stop = max(0, length + stop) if start is None: start = 0 if start < 0: start = max(0, length + start) if start >= length or start >= stop: return WideString('') if stop < length and self.chars[stop] == '': if 
self.chars[start] == '': return WideString(' ' + ''.join(self.chars[start : stop - 1]) + ' ') return WideString(''.join(self.chars[start : stop - 1]) + ' ') if self.chars[start] == '': return WideString(' ' + ''.join(self.chars[start : stop - 1])) return WideString(''.join(self.chars[start:stop])) def __len__(self): """ >>> len(WideString('poo')) 3 >>> len(WideString('モヒカン')) 8 """ return len(self.chars) def ljust(self, width, fillchar=' '): """ >>> WideString('poo').ljust(2) >>> WideString('poo').ljust(5) >>> WideString('モヒカン').ljust(10) """ if width > len(self): return WideString(self.string + fillchar * width)[:width] return self def rjust(self, width, fillchar=' '): """ >>> WideString('poo').rjust(2) >>> WideString('poo').rjust(5) >>> WideString('モヒカン').rljust(10) """ if width > len(self): return WideString(fillchar * width + self.string)[-width:] return self def center(self, width, fillchar=' '): """ >>> WideString('poo').center(2) >>> WideString('poo').center(5) >>> WideString('モヒカン').center(10) """ if width > len(self): left_width = (width - len(self)) // 2 right_width = width - left_width return WideString(fillchar * left_width + self.string + fillchar * right_width)[:width] return self def strip(self, chars=None): """ >>> WideString(' poo ').strip() >>> WideString(' モヒカン ').strip() """ return WideString(self.string.strip(chars)) def lstrip(self, chars=None): """ >>> WideString(' poo ').lstrip() >>> WideString(' モヒカン ').lstrip() """ return WideString(self.string.lstrip(chars)) def rstrip(self, chars=None): """ >>> WideString(' poo ').rstrip() >>> WideString(' モヒカン ').rstrip() """ return WideString(self.string.rstrip(chars)) nvitop-1.4.2/nvitop/tui/screens/000077500000000000000000000000001474547113600166335ustar00rootroot00000000000000nvitop-1.4.2/nvitop/tui/screens/__init__.py000066400000000000000000000006461474547113600207520ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. 
# pylint: disable=missing-module-docstring from nvitop.tui.screens.environ import EnvironScreen from nvitop.tui.screens.help import HelpScreen from nvitop.tui.screens.main import BreakLoop, MainScreen from nvitop.tui.screens.metrics import ProcessMetricsScreen from nvitop.tui.screens.treeview import TreeViewScreen nvitop-1.4.2/nvitop/tui/screens/environ.py000066400000000000000000000204401474547113600206650ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. # pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring from collections import OrderedDict from functools import partial from itertools import islice from nvitop.tui.library import Displayable, GpuProcess, HostProcess, WideString, host class EnvironScreen(Displayable): # pylint: disable=too-many-instance-attributes NAME = 'environ' def __init__(self, win, root): super().__init__(win, root) self.this = HostProcess() self._process = None self._environ = None self.items = None self.username = None self.command = None self.x_offset = 0 self._y_offset = 0 self.scroll_offset = 0 self.y_mouse = None self._height = 0 self.x, self.y = root.x, root.y self.width, self.height = root.width, root.height @property def process(self): return self._process @process.setter def process(self, value): if value is None: value = self.this self._process = value with self.process.oneshot(): try: self.environ = self.process.environ().copy() except host.PsutilError: self.environ = None try: self.command = self.process.command() except host.PsutilError: self.command = 'N/A' try: self.username = self.process.username() except host.PsutilError: self.username = 'N/A' self.command = WideString(self.command) self.username = WideString(self.username) @property def environ(self): return self._environ @environ.setter def environ(self, value): newline = '␤' if not self.root.ascii else '?' 
def normalize(s): return s.replace('\n', newline) if value is not None: self.items = [ (WideString(key), WideString(f'{key}={normalize(value[key])}')) for key in sorted(value.keys()) ] value = OrderedDict(self.items) else: self.items = None self._environ = value self.x_offset = 0 self.y_offset = 0 self.scroll_offset = 0 @property def height(self): return self._height @height.setter def height(self, value): self._height = value try: self.y_offset = self.y_offset except AttributeError: pass @property def display_height(self): return self.height - 2 @property def y_offset(self): return self._y_offset @y_offset.setter def y_offset(self, value): if self.environ is None: self._y_offset = 0 self.scroll_offset = 0 return n_items = len(self.environ) self._y_offset = max(0, min(value, n_items - 1)) if n_items <= self.scroll_offset + self.display_height: self.scroll_offset = max(0, n_items - self.display_height) elif self.y_offset > self.scroll_offset + self.display_height - 1: self.scroll_offset = self.y_offset - self.display_height + 1 self.scroll_offset = min(self.scroll_offset, self.y_offset) def move(self, direction, wheel=False): if self.environ is not None and wheel: n_items = len(self.environ) old_scroll_offset = self.scroll_offset self.scroll_offset = max( 0, min(self.scroll_offset + direction, n_items - self.display_height), ) direction -= self.scroll_offset - old_scroll_offset self._y_offset += self.scroll_offset - old_scroll_offset self.y_offset += direction def update_size(self, termsize=None): n_term_lines, n_term_cols = termsize = super().update_size(termsize=termsize) self.width = n_term_cols - self.x self.height = n_term_lines - self.y return termsize def draw(self): self.color_reset() if isinstance(self.process, GpuProcess): process_type = 'GPU: ' + self.process.type.replace('C', 'Compute').replace( 'G', 'Graphics', ) else: process_type = 'Host' header_prefix = WideString( f'Environment of process {self.process.pid} ({self.username}@{process_type}): ', ) 
offset = max(0, min(self.x_offset, len(self.command) + len(header_prefix) - self.width)) header = str((header_prefix + self.command[offset:]).ljust(self.width)[: self.width]) self.addstr(self.y, self.x, header) self.addstr(self.y + 1, self.x, '#' * self.width) self.color_at(self.y, self.x, width=len(header_prefix) - 1, fg='cyan', attr='bold') self.color_at(self.y + 1, self.x, width=self.width, fg='green', attr='bold') if self.environ is None: self.addstr(self.y + 2, self.x, 'Could not read process environment.') self.color_at(self.y + 2, self.x, width=self.width, fg='cyan', attr='reverse') return items = islice(self.items, self.scroll_offset, self.scroll_offset + self.display_height) for y, (key, line) in enumerate(items, start=self.y + 2): key_length = len(key) line = str(line[self.x_offset :].ljust(self.width)[: self.width]) self.addstr(y, self.x, line) if self.x_offset < key_length: self.color_at(y, self.x, width=key_length - self.x_offset, fg='blue', attr='bold') if self.x_offset < key_length + 1: self.color_at(y, self.x + key_length - self.x_offset, width=1, fg='magenta') if y == self.y_mouse: self.y_offset = y - (self.y + 2 - self.scroll_offset) if y == self.y + 2 - self.scroll_offset + self.y_offset: self.color_at(y, self.x, width=self.width, fg='cyan', attr='bold | reverse') def finalize(self): self.y_mouse = None super().finalize() def press(self, key): self.root.keymaps.use_keymap('environ') self.root.press(key) def click(self, event): if event.pressed(1) or event.pressed(3) or event.clicked(1) or event.clicked(3): self.y_mouse = event.y return True direction = event.wheel_direction() if event.shift(): self.x_offset = max(0, self.x_offset + 2 * direction) else: self.move(direction=direction, wheel=True) return True def init_keybindings(self): def refresh_environ(): self.process = self.root.previous_screen.selection.process self.need_redraw = True def environ_left(): self.x_offset = max(0, self.x_offset - 5) def environ_right(): self.x_offset += 5 def 
environ_begin(): self.x_offset = 0 def environ_move(direction): self.move(direction=direction) keymaps = self.root.keymaps keymaps.bind('environ', 'r', refresh_environ) keymaps.copy('environ', 'r', 'R') keymaps.copy('environ', 'r', '') keymaps.copy('environ', 'r', '') keymaps.bind('environ', '', environ_left) keymaps.copy('environ', '', '') keymaps.bind('environ', '', environ_right) keymaps.copy('environ', '', '') keymaps.bind('environ', '', environ_begin) keymaps.copy('environ', '', '^') keymaps.bind('environ', '', partial(environ_move, direction=-1)) keymaps.copy('environ', '', '') keymaps.copy('environ', '', '') keymaps.copy('environ', '', '') keymaps.copy('environ', '', '[') keymaps.bind('environ', '', partial(environ_move, direction=+1)) keymaps.copy('environ', '', '') keymaps.copy('environ', '', '') keymaps.copy('environ', '', '') keymaps.copy('environ', '', ']') keymaps.bind('environ', '', partial(environ_move, direction=-(1 << 20))) keymaps.bind('environ', '', partial(environ_move, direction=+(1 << 20))) nvitop-1.4.2/nvitop/tui/screens/help.py000066400000000000000000000107221474547113600201370ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. # pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring from nvitop.tui.library import Device, Displayable, MouseEvent from nvitop.version import __version__ HELP_TEMPLATE = """nvitop {} - (C) Xuehai Pan, 2021-2025. Released under the GNU GPLv3 License. GPU Process Type: C: Compute, G: Graphics, X: Mixed. Device coloring rules by loading intensity: - GPU utilization: light < {:2d}% <= moderate < {:2d}% <= heavy. - GPU-MEM percent: light < {:2d}% <= moderate < {:2d}% <= heavy. 
a f c: change display mode h ?: show this help screen F5 r: force refresh window q: quit Arrows: scroll process list Space: tag/untag current process Home: select the first process Esc: clear process selection End: select the last process Ctrl-C I: interrupt selected process K: kill selected process Ctrl-A ^: scroll to left most T: terminate selected process Ctrl-E $: scroll to right most e: show process environment PageUp [: scroll entire screen up t: toggle tree-view screen PageDown ]: scroll entire screen down Enter: show process metrics Wheel: scroll process list Shift-Wheel: scroll horizontally Tab: scroll process list Ctrl-Wheel: fast scroll ({}x) on oN: sort by GPU-INDEX os oS: sort by %SM op oP: sort by PID oc oC: sort by %CPU ou oU: sort by USER om oM: sort by %MEM og oG: sort by GPU-MEM ot oT: sort by TIME , .: select sort column /: invert sort order Press any key to return. """ class HelpScreen(Displayable): # pylint: disable=too-many-instance-attributes NAME = 'help' def __init__(self, win, root): super().__init__(win, root) self.infos = ( HELP_TEMPLATE.format( __version__, *Device.GPU_UTILIZATION_THRESHOLDS, *Device.MEMORY_UTILIZATION_THRESHOLDS, MouseEvent.CTRL_SCROLLWHEEL_MULTIPLIER, ) .strip() .splitlines() ) self.color_matrix = { 9: ('green', 'green'), 10: ('green', 'green'), 12: ('cyan', 'yellow'), 13: ('cyan', 'yellow'), 14: ('cyan', 'red'), 15: (None, 'red'), 16: ('cyan', 'red'), **dict.fromkeys(range(17, 20), ('cyan', 'green')), **dict.fromkeys(range(21, 23), ('blue', 'blue')), **dict.fromkeys(range(24, 28), ('blue', 'blue')), 28: ('magenta', 'magenta'), } self.x, self.y = root.x, root.y self.width = max(map(len, self.infos)) self.height = len(self.infos) def draw(self): if not self.need_redraw: return self.color_reset() for y, line in enumerate(self.infos, start=self.y): self.addstr(y, self.x, line) self.color_at(self.y, self.x, width=self.width, fg='cyan', attr='bold') self.color_at(self.y + 1, self.x, width=self.width, fg='cyan', 
attr='bold') self.color_at(self.y + self.height - 1, self.x, width=self.width, fg='cyan', attr='bold') self.color_at(self.y + 3, self.x, width=17, attr='bold') for dx in (18, 30, 43): self.color_at(self.y + 3, self.x + dx, width=1, fg='magenta', attr='bold') for dx in (21, 33, 48): self.color_at(self.y + 3, self.x + dx, width=1, attr='underline') self.color_at(self.y + 5, self.x, width=21, attr='bold') for dy in (6, 7): self.color_at(self.y + dy, self.x + 21, width=5, fg='green', attr='bold | italic') self.color_at(self.y + dy, self.x + 36, width=8, fg='yellow', attr='bold | italic') self.color_at(self.y + dy, self.x + 54, width=5, fg='red', attr='bold | italic') for dy, (left, right) in self.color_matrix.items(): if left is not None: self.color_at(self.y + dy, self.x, width=12, fg=left, attr='bold') if right is not None: self.color_at(self.y + dy, self.x + 39, width=13, fg=right, attr='bold') def press(self, key): self.root.keymaps.use_keymap('help') self.root.press(key) nvitop-1.4.2/nvitop/tui/screens/main/000077500000000000000000000000001474547113600175575ustar00rootroot00000000000000nvitop-1.4.2/nvitop/tui/screens/main/__init__.py000066400000000000000000000234361474547113600217000ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. 
# pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring

import threading
from functools import partial

from nvitop.tui.library import LARGE_INTEGER, DisplayableContainer, MouseEvent, send_signal
from nvitop.tui.screens.main.device import DevicePanel
from nvitop.tui.screens.main.host import HostPanel
from nvitop.tui.screens.main.process import ProcessPanel


class BreakLoop(Exception):  # noqa: N818
    """Control-flow exception raised by the ``q`` keybinding to leave the main event loop."""

    pass


class MainScreen(DisplayableContainer):  # pylint: disable=too-many-instance-attributes
    """The main TUI screen: stacks the device, host, and process panels vertically.

    Layout modes:
      - ``'full'``    -- every panel uses its full height.
      - ``'compact'`` -- every panel uses its compact height.
      - ``'auto'``    -- panels are compacted one by one (device, then host, then
        process) until the stack fits the terminal height (see ``update_size``).
    """

    NAME = 'main'

    # pylint: disable-next=redefined-builtin,too-many-arguments,too-many-locals,too-many-statements
    def __init__(self, devices, filters, *, ascii, mode, win, root):
        """Build the three child panels and wire up shared state (selection, geometry)."""
        super().__init__(win, root)

        self.width = root.width

        assert mode in {'auto', 'full', 'compact'}
        compact = mode == 'compact'
        self.mode = mode
        self._compact = compact

        self.devices = devices
        self.device_count = len(self.devices)

        self.snapshot_lock = threading.Lock()

        self.device_panel = DevicePanel(self.devices, compact, win=win, root=root)
        self.device_panel.focused = False
        self.add_child(self.device_panel)

        # Host and process panels track *leaf* devices: MIG instances when MIG is
        # enabled on a device, otherwise the physical device itself.
        self.host_panel = HostPanel(self.device_panel.leaf_devices, compact, win=win, root=root)
        self.host_panel.focused = False
        self.add_child(self.host_panel)

        self.process_panel = ProcessPanel(
            self.device_panel.leaf_devices,
            compact,
            filters,
            win=win,
            root=root,
        )
        self.process_panel.focused = False
        self.add_child(self.process_panel)

        # The process panel owns the row selection; the screen just aliases it.
        self.selection = self.process_panel.selection

        self.ascii = ascii
        self.device_panel.ascii = self.ascii
        self.host_panel.ascii = self.ascii
        self.process_panel.ascii = self.ascii
        if ascii:
            # ASCII terminals cannot draw the history graphs, so the host panel
            # is pinned to its compact height.
            self.host_panel.full_height = self.host_panel.height = self.host_panel.compact_height

        # Stack the panels top-to-bottom starting at the root origin.
        self.x, self.y = root.x, root.y
        self.device_panel.x = self.host_panel.x = self.process_panel.x = self.x
        self.device_panel.y = self.y
        self.host_panel.y = self.device_panel.y + self.device_panel.height
        self.process_panel.y = self.host_panel.y + self.host_panel.height
        self.height = self.device_panel.height + self.host_panel.height + self.process_panel.height

    @property
    def compact(self):
        return self._compact

    @compact.setter
    def compact(self, value):
        # Changing the layout invalidates the whole frame.
        if self._compact != value:
            self.need_redraw = True
            self._compact = value

    def update_size(self, termsize=None):
        """Recompute panel widths/heights and vertical positions for the terminal size.

        Returns the ``(n_term_lines, n_term_cols)`` tuple from the parent class.
        """
        n_term_lines, n_term_cols = termsize = super().update_size(termsize=termsize)

        self.width = n_term_cols - self.x
        self.device_panel.width = self.width
        self.host_panel.width = self.width
        self.process_panel.width = self.width

        self.y = min(self.y, self.root.y)
        height = n_term_lines - self.y
        # Candidate stack heights, from "everything full" to "device+host compact";
        # in 'auto' mode each panel collapses only when the previous step still
        # does not fit.
        heights = [
            self.device_panel.full_height
            + self.host_panel.full_height
            + self.process_panel.full_height,
            self.device_panel.compact_height
            + self.host_panel.full_height
            + self.process_panel.full_height,
            self.device_panel.compact_height
            + self.host_panel.compact_height
            + self.process_panel.full_height,
        ]
        if self.mode == 'auto':
            self.compact = height < heights[0]
            self.host_panel.compact = height < heights[1]
            self.process_panel.compact = height < heights[-1]
        else:
            self.compact = self.mode == 'compact'
            self.host_panel.compact = self.compact
            self.process_panel.compact = self.compact
        self.device_panel.compact = self.compact

        self.device_panel.y = self.y
        self.host_panel.y = self.device_panel.y + self.device_panel.height
        self.process_panel.y = self.host_panel.y + self.host_panel.height

        height = self.device_panel.height + self.host_panel.height + self.process_panel.height
        if self.y < self.root.y and self.y + height < n_term_lines:
            # The screen was scrolled up but now fits: snap it back down and
            # recompute with the adjusted origin.
            self.y = min(self.root.y + self.root.height - height, self.root.y)
            self.update_size(termsize)
            self.need_redraw = True

        if self.height != height:
            self.height = height
            self.need_redraw = True

        return termsize

    def move(self, direction=0):
        """Scroll the whole screen vertically by ``direction`` rows (positive = down)."""
        if direction == 0:
            return

        self.y -= direction
        self.update_size()
        self.need_redraw = True

    def poke(self):
        """Periodic tick: re-layout if any child panel changed height."""
        super().poke()

        height = self.device_panel.height + self.host_panel.height + self.process_panel.height
        if self.height != height:
            self.update_size()
            self.need_redraw = True

    def draw(self):
        self.color_reset()
        super().draw()

    def print(self):
        """Non-interactive (one-shot) mode: print all panels to stdout."""
        if self.device_count > 0:
            print_width = min(panel.print_width() for panel in self.container)
            self.width = max(print_width, min(self.width, 100))
        else:
            self.width = 79
        for panel in self.container:
            panel.width = self.width
            panel.print()

    def __contains__(self, item):
        # Any mouse event belongs to this screen while it is visible, so clicks
        # outside the panels are still routed here.
        if self.visible and isinstance(item, MouseEvent):
            return True
        return super().__contains__(item)

    def init_keybindings(self):
        """Register all key handlers for the 'main' keymap on the root keymap table.

        NOTE(review): many key-name strings below are empty (``''``). In upstream
        nvitop these are angle-bracket key names such as ``'<C-r>'``, ``'<F5>'``,
        ``'<PageUp>'``, ``'<Home>'``, ``'<Up>'``, ``'<Tab>'``, ``'<Esc>'``, etc.
        They appear to have been stripped by tag-removal during extraction of this
        dump -- restore from upstream before use; as written, multiple bindings
        share the same empty key string.
        """
        # pylint: disable=too-many-locals,too-many-statements

        def quit():  # pylint: disable=redefined-builtin
            raise BreakLoop

        def change_mode(mode):
            self.mode = mode
            self.root.update_size()

        def force_refresh():
            # Reset selection, horizontal offset, and scroll position, then redraw.
            select_clear()
            host_begin()
            self.y = self.root.y
            self.root.update_size()
            self.root.need_redraw = True

        def screen_move(direction):
            self.move(direction)

        def host_left():
            self.process_panel.host_offset -= 2

        def host_right():
            self.process_panel.host_offset += 2

        def host_begin():
            self.process_panel.host_offset = -1

        def host_end():
            # LARGE_INTEGER scrolls past any realistic command-line length.
            self.process_panel.host_offset = LARGE_INTEGER

        def select_move(direction):
            self.selection.move(direction=direction)

        def select_clear():
            self.selection.clear()

        def tag():
            self.selection.tag()
            select_move(direction=+1)

        def sort_by(order, reverse):
            self.process_panel.order = order
            self.process_panel.reverse = reverse
            self.root.update_size()

        def order_previous():
            sort_by(order=ProcessPanel.ORDERS[self.process_panel.order].previous, reverse=False)

        def order_next():
            sort_by(order=ProcessPanel.ORDERS[self.process_panel.order].next, reverse=False)

        def order_reverse():
            sort_by(order=self.process_panel.order, reverse=not self.process_panel.reverse)

        keymaps = self.root.keymaps

        keymaps.bind('main', 'q', quit)
        keymaps.copy('main', 'q', 'Q')
        keymaps.bind('main', 'a', partial(change_mode, mode='auto'))
        keymaps.bind('main', 'f', partial(change_mode, mode='full'))
        keymaps.bind('main', 'c', partial(change_mode, mode='compact'))
        keymaps.bind('main', 'r', force_refresh)
        keymaps.copy('main', 'r', 'R')
        keymaps.copy('main', 'r', '')
        keymaps.copy('main', 'r', '')
        keymaps.bind('main', '', partial(screen_move, direction=-1))
        keymaps.copy('main', '', '[')
        keymaps.copy('main', '', '')
        keymaps.bind('main', '', partial(screen_move, direction=+1))
        keymaps.copy('main', '', ']')
        keymaps.copy('main', '', '')
        keymaps.bind('main', '', host_left)
        keymaps.copy('main', '', '')
        keymaps.bind('main', '', host_right)
        keymaps.copy('main', '', '')
        keymaps.bind('main', '', host_begin)
        keymaps.copy('main', '', '^')
        keymaps.bind('main', '', host_end)
        keymaps.copy('main', '', '$')
        keymaps.bind('main', '', partial(select_move, direction=-1))
        keymaps.copy('main', '', '')
        keymaps.copy('main', '', '')
        keymaps.bind('main', '', partial(select_move, direction=+1))
        keymaps.copy('main', '', '')
        keymaps.copy('main', '', '')
        keymaps.bind('main', '', partial(select_move, direction=-(1 << 20)))
        keymaps.bind('main', '', partial(select_move, direction=+(1 << 20)))
        keymaps.bind('main', '', select_clear)
        keymaps.bind('main', '', tag)
        keymaps.bind('main', 'T', partial(send_signal, signal='terminate', panel=self))
        keymaps.bind('main', 'K', partial(send_signal, signal='kill', panel=self))
        keymaps.copy('main', 'K', 'k')
        keymaps.bind('main', '', partial(send_signal, signal='interrupt', panel=self))
        keymaps.copy('main', '', 'I')
        keymaps.bind('main', ',', order_previous)
        keymaps.copy('main', ',', '<')
        keymaps.bind('main', '.', order_next)
        keymaps.copy('main', '.', '>')
        keymaps.bind('main', '/', order_reverse)

        # ``o<letter>`` sorts ascending by that column; ``o<LETTER>`` descending.
        for order in ProcessPanel.ORDERS:
            keymaps.bind(
                'main',
                'o' + order[:1].lower(),
                partial(sort_by, order=order, reverse=False),
            )
            keymaps.bind(
                'main',
                'o' + order[:1].upper(),
                partial(sort_by, order=order, reverse=True),
            )
# NOTE(review): the line below is tar-archive header residue (ustar header for the
# next member of the dumped tarball); it is not Python and is preserved verbatim.
nvitop-1.4.2/nvitop/tui/screens/main/device.py000066400000000000000000000532271474547113600213770ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3.
# pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring

import threading
import time

from nvitop.tui.library import (
    NA,
    Device,
    Displayable,
    colored,
    cut_string,
    host,
    make_bar,
    ttl_cache,
)
from nvitop.version import __version__


class DevicePanel(Displayable):  # pylint: disable=too-many-instance-attributes
    """Top panel: the per-GPU (and per-MIG-instance) status table.

    A background daemon thread refreshes device snapshots at ``SNAPSHOT_INTERVAL``;
    ``poke()`` publishes the most recent buffer to the drawing code under a lock.

    NOTE(review): the box-drawing format strings in this file appear to have had
    internal runs of spaces collapsed to single spaces by the extraction of this
    dump (the ``:>N`` field widths imply wider headers/lines) -- restore the exact
    column layout from upstream before relying on the rendered output.
    """

    NAME = 'device'
    # Seconds between background snapshot refreshes (see ``set_snapshot_interval``).
    SNAPSHOT_INTERVAL = 0.5

    def __init__(self, devices, compact, win, root):
        """Enumerate physical and MIG devices, size the panel, and prepare formats."""
        super().__init__(win, root)

        self.devices = devices
        self.device_count = len(self.devices)
        # ``all_devices``  -- physical devices, each followed by its MIG children.
        # ``leaf_devices`` -- MIG children when MIG is enabled, else the physical device.
        self.all_devices = []
        self.leaf_devices = []
        self.mig_device_counts = [0] * self.device_count
        self.mig_enabled_device_count = 0
        for i, device in enumerate(self.devices):
            self.all_devices.append(device)
            mig_devices = device.mig_devices()
            self.mig_device_counts[i] = len(mig_devices)
            if self.mig_device_counts[i] > 0:
                self.all_devices.extend(mig_devices)
                self.leaf_devices.extend(mig_devices)
                self.mig_enabled_device_count += 1
            else:
                self.leaf_devices.append(device)
        self.mig_device_count = sum(self.mig_device_counts)
        self.all_device_count = len(self.all_devices)
        self.leaf_device_count = len(self.leaf_devices)

        self._compact = compact
        self.width = max(79, root.width)
        # Compact: one row per device; full: two rows per device; MIG instances
        # always add one row each plus a separator per MIG-enabled device.
        self.compact_height = (
            4 + 2 * (self.device_count + 1) + self.mig_device_count + self.mig_enabled_device_count
        )
        self.full_height = self.compact_height + self.device_count + 1
        self.height = self.compact_height if compact else self.full_height
        if self.device_count == 0:
            # "No visible devices found" banner only.
            self.height = self.full_height = self.compact_height = 6

        self.driver_version = Device.driver_version()
        self.cuda_driver_version = Device.cuda_driver_version()

        self._snapshot_buffer = []
        self._snapshots = []
        self.snapshot_lock = threading.Lock()
        self.snapshots = self.take_snapshots()
        self._snapshot_daemon = threading.Thread(
            name='device-snapshot-daemon',
            target=self._snapshot_target,
            daemon=True,
        )
        self._daemon_running = threading.Event()

        # str.format templates keyed by snapshot attribute names (filled via
        # ``fmt.format(**device.__dict__)`` in draw()/print()).
        self.formats_compact = [
            '│ {physical_index:>3} {fan_speed_string:>3} {temperature_string:>4} '
            '{performance_state:>3} {power_status:>12} '
            '│ {memory_usage:>20} │ {gpu_utilization_string:>7} {compute_mode:>11} │',
        ]
        self.formats_full = [
            '│ {physical_index:>3} {name:<18} {persistence_mode:<4} '
            '│ {bus_id:<16} {display_active:>3} │ {total_volatile_uncorrected_ecc_errors:>20} │',
            '│ {fan_speed_string:>3} {temperature_string:>4} {performance_state:>4} {power_status:>12} '
            '│ {memory_usage:>20} │ {gpu_utilization_string:>7} {compute_mode:>11} │',
        ]
        self.mig_formats = [
            '│{physical_index:>2}:{mig_index:<2}{name:>12} @ GI/CI:{gpu_instance_id:>2}/{compute_instance_id:<2}'
            '│ {memory_usage:>20} │ BAR1: {bar1_memory_used_human:>8} / {bar1_memory_percent_string:>3} │',
        ]
        if host.WINDOWS:
            # Windows shows the driver model (TCC/WDDM) instead of persistence mode.
            self.formats_full[0] = self.formats_full[0].replace(
                'persistence_mode',
                'current_driver_model',
            )

        self.support_mig = any('N/A' not in device.mig_mode for device in self.snapshots)
        if self.support_mig:
            # Make room for the MIG mode column in the ECC field.
            self.formats_full[0] = self.formats_full[0].replace(
                '{total_volatile_uncorrected_ecc_errors:>20}',
                '{mig_mode:>8} {total_volatile_uncorrected_ecc_errors:>10}',
            )

    @property
    def width(self):
        return self._width

    @width.setter
    def width(self, value):
        # The panel never renders narrower than the 79-column frame.
        width = max(79, value)
        if self._width != width and self.visible:
            self.need_redraw = True
        self._width = width

    @property
    def compact(self):
        return self._compact

    @compact.setter
    def compact(self, value):
        if self._compact != value:
            self.need_redraw = True
            self._compact = value
            self.height = self.compact_height if self.compact else self.full_height

    @property
    def snapshots(self):
        return self._snapshots

    @snapshots.setter
    def snapshots(self, snapshots):
        # Guarded: the snapshot daemon and the draw loop touch this concurrently.
        with self.snapshot_lock:
            self._snapshots = snapshots

    @classmethod
    def set_snapshot_interval(cls, interval):
        """Re-tune the daemon sleep and the TTL cache for a new refresh interval."""
        assert interval > 0.0
        interval = float(interval)

        # Sample ~3x faster than the requested interval, capped at 1s, and rebuild
        # the ttl_cache wrapper around the undecorated method.
        cls.SNAPSHOT_INTERVAL = min(interval / 3.0, 1.0)
        cls.take_snapshots = ttl_cache(ttl=interval)(
            cls.take_snapshots.__wrapped__,  # pylint: disable=no-member
        )

    @ttl_cache(ttl=1.0)
    def take_snapshots(self):
        """Snapshot every device and normalize display strings; fills the buffer."""
        snapshots = [device.as_snapshot() for device in self.all_devices]

        for device in snapshots:
            if device.name.startswith('NVIDIA '):
                device.name = device.name.replace('NVIDIA ', '', 1)
            if device.is_mig_device:
                # MIG instances show only the short profile name (e.g. '1g.10gb').
                device.name = device.name.rpartition(' ')[-1]
                if device.bar1_memory_percent is not NA:
                    device.bar1_memory_percent = round(device.bar1_memory_percent)
                    if device.bar1_memory_percent >= 100:
                        device.bar1_memory_percent_string = 'MAX'
                    else:
                        device.bar1_memory_percent_string = f'{round(device.bar1_memory_percent)}%'
            else:
                device.name = cut_string(device.name, maxlen=18, padstr='..', align='right')
                device.current_driver_model = device.current_driver_model.replace('WDM', 'TCC')
                device.display_active = device.display_active.replace('Enabled', 'On').replace(
                    'Disabled',
                    'Off',
                )
                device.persistence_mode = device.persistence_mode.replace('Enabled', 'On').replace(
                    'Disabled',
                    'Off',
                )
                device.mig_mode = device.mig_mode.replace('N/A', 'N/A ')
                device.compute_mode = device.compute_mode.replace('Exclusive', 'E.')
                if device.fan_speed is not NA and device.fan_speed >= 100:
                    device.fan_speed_string = 'MAX'

        with self.snapshot_lock:
            self._snapshot_buffer = snapshots

        return snapshots

    def _snapshot_target(self):
        """Daemon-thread loop: refresh snapshots until the running event is cleared."""
        self._daemon_running.wait()
        while self._daemon_running.is_set():
            self.take_snapshots()
            time.sleep(self.SNAPSHOT_INTERVAL)

    def header_lines(self, compact=None):
        """Build the top banner lines (version info plus the table header)."""
        if compact is None:
            compact = self.compact

        version_infos = [
            'NVITOP {}'.format(__version__.partition('+')[0]),
            f'Driver Version: {self.driver_version}',
            f'CUDA Driver Version: {self.cuda_driver_version}',
        ]
        # Pad to an even total so the separators center the text in 75 columns.
        if sum(len(v) for v in version_infos) % 2 == 0:
            version_infos[0] += ' '
        version_seps = ' ' * max(2, (75 - sum(len(v) for v in version_infos)) // 2)

        header = [
            '╒═════════════════════════════════════════════════════════════════════════════╕',
            '│ {} │'.format(version_seps.join(version_infos).ljust(75, ' ')),
        ]
        if self.device_count > 0:
            header.append(
                '├───────────────────────────────┬──────────────────────┬──────────────────────┤',
            )
            if compact:
                header.append(
                    '│ GPU Fan Temp Perf Pwr:Usg/Cap │ Memory-Usage │ GPU-Util Compute M. │',
                )
            else:
                header.extend(
                    (
                        '│ GPU Name Persistence-M│ Bus-Id Disp.A │ Volatile Uncorr. ECC │',
                        '│ Fan Temp Perf Pwr:Usage/Cap│ Memory-Usage │ GPU-Util Compute M. │',
                    ),
                )
                if host.WINDOWS:
                    header[-2] = header[-2].replace('Persistence-M', ' TCC/WDDM ')
                if self.support_mig:
                    header[-2] = header[-2].replace('Volatile Uncorr. ECC', 'MIG M. Uncorr. ECC')
            header.append(
                '╞═══════════════════════════════╪══════════════════════╪══════════════════════╡',
            )
        else:
            header.extend(
                (
                    '╞═════════════════════════════════════════════════════════════════════════════╡',
                    '│ No visible devices found │',
                    '╘═════════════════════════════════════════════════════════════════════════════╛',
                ),
            )
        return header

    def frame_lines(self, compact=None):
        """Build the full static frame (header plus empty rows and separators)."""
        if compact is None:
            compact = self.compact

        frame = self.header_lines(compact)

        remaining_width = self.width - 79
        data_line = (
            '│ │ │ │'
        )
        separator_line = (
            '├───────────────────────────────┼──────────────────────┼──────────────────────┤'
        )
        if self.width >= 100:
            # Extend the frame rightwards for the history-bar column.
            data_line += ' ' * (remaining_width - 1) + '│'
            separator_line = separator_line[:-1] + '┼' + '─' * (remaining_width - 1) + '┤'

        if self.device_count > 0:
            for mig_device_count in self.mig_device_counts:
                if compact:
                    frame.extend((data_line, separator_line))
                else:
                    frame.extend((data_line, data_line, separator_line))
                if mig_device_count > 0:
                    frame.extend((data_line,) * mig_device_count + (separator_line,))
            # Replace the trailing separator with the bottom border.
            frame.pop()
            frame.append(
                '╘═══════════════════════════════╧══════════════════════╧══════════════════════╛',
            )

            if self.width >= 100:
                frame[5 - int(compact)] = (
                    frame[5 - int(compact)][:-1] + '╪' + '═' * (remaining_width - 1) + '╕'
                )
                frame[-1] = frame[-1][:-1] + '╧' + '═' * (remaining_width - 1) + '╛'

        return frame

    def poke(self):
        # Lazily start the daemon on the first tick, then publish the latest buffer.
        if not self._daemon_running.is_set():
            self._daemon_running.set()
            self._snapshot_daemon.start()

        self.snapshots = self._snapshot_buffer

        super().poke()

    # pylint: disable-next=too-many-locals,too-many-branches,too-many-statements
    def draw(self):
        """Render the device table and, when wide enough, the MEM/UTL bars."""
        self.color_reset()

        if self.need_redraw:
            self.addstr(self.y, self.x, '(Press h for help or q to quit)'.rjust(79))
            self.color_at(self.y, self.x + 55, width=1, fg='magenta', attr='bold | italic')
            self.color_at(self.y, self.x + 69, width=1, fg='magenta', attr='bold | italic')
            for y, line in enumerate(self.frame_lines(), start=self.y + 1):
                self.addstr(y, self.x, line)

        self.addstr(self.y, self.x, cut_string(time.strftime('%a %b %d %H:%M:%S %Y'), maxlen=32))

        formats = self.formats_compact if self.compact else self.formats_full

        remaining_width = self.width - 79
        draw_bars = self.width >= 100
        try:
            selected_device = self.parent.selection.process.device
        except AttributeError:
            selected_device = None

        y_start = self.y + len(formats) + 5
        prev_device_index = self.snapshots[0].tuple_index
        for index, device in enumerate(self.snapshots):  # pylint: disable=too-many-nested-blocks
            # A change of physical device (or MIG nesting level) crosses a separator row.
            if (
                len(prev_device_index) != len(device.tuple_index)
                or prev_device_index[0] != device.tuple_index[0]
            ):
                y_start += 1

            # Highlight the selected process's device; dim unrelated devices.
            attr = 0
            if selected_device is not None:
                if device.real == selected_device:
                    attr = 'bold'
                elif (
                    device.is_mig_device
                    or device.physical_index != selected_device.physical_index
                ):
                    attr = 'dim'

            fmts = self.mig_formats if device.is_mig_device else formats
            for y, fmt in enumerate(fmts, start=y_start):
                self.addstr(y, self.x, fmt.format(**device.__dict__))
                self.color_at(y, self.x + 1, width=31, fg=device.display_color, attr=attr)
                self.color_at(y, self.x + 33, width=22, fg=device.display_color, attr=attr)
                self.color_at(y, self.x + 56, width=22, fg=device.display_color, attr=attr)

            if draw_bars:
                # (x, y, width, label, value, color) for each bar to render.
                matrix = [
                    (
                        self.x + 80,
                        y_start,
                        remaining_width - 3,
                        'MEM',
                        device.memory_percent,
                        device.memory_display_color,
                    ),
                    (
                        self.x + 80,
                        y_start + 1,
                        remaining_width - 3,
                        'UTL',
                        device.gpu_utilization,
                        device.gpu_display_color,
                    ),
                ]
                if self.compact:
                    if remaining_width >= 44 and not device.is_mig_device:
                        # Compact rows are one line tall: place MEM and UTL side by side.
                        left_width = (remaining_width - 6 + 1) // 2 - 1
                        right_width = (remaining_width - 6) // 2 + 1
                        matrix = [
                            (
                                self.x + 80,
                                y_start,
                                left_width,
                                'MEM',
                                device.memory_percent,
                                device.memory_display_color,
                            ),
                            (
                                self.x + 80 + left_width + 3,
                                y_start,
                                right_width,
                                'UTL',
                                device.gpu_utilization,
                                device.gpu_display_color,
                            ),
                        ]
                        # Pick the joint glyph for the divider between the two bars.
                        separator = '┼' if index > 0 else '╤'
                        if len(prev_device_index) == 2:
                            separator = '┬'
                        self.addstr(y_start - 1, self.x + 80 + left_width + 1, separator)
                        self.addstr(y_start, self.x + 80 + left_width + 1, '│')
                        if index == len(self.snapshots) - 1:
                            self.addstr(y_start + 1, self.x + 80 + left_width + 1, '╧')
                    else:
                        # NOTE(review): ``left_width`` here relies on a value computed in
                        # an earlier iteration; if the first device entered this branch it
                        # would be unbound -- confirm against upstream.
                        if remaining_width >= 44 and len(prev_device_index) == 1:
                            self.addstr(y_start - 1, self.x + 80 + left_width + 1, '┴')
                        matrix.pop()
                elif device.is_mig_device:
                    # MIG rows are one line tall: only the MEM bar fits.
                    matrix.pop()

                for x_offset, y, width, prefix, utilization, color in matrix:
                    # pylint: disable-next=disallowed-name
                    bar = make_bar(prefix, utilization, width=width)
                    self.addstr(y, x_offset, bar)

                    if self.TERM_256COLOR:
                        # Gradient-color the filled portion of the bar per cell.
                        parts = bar.rstrip().split(' ')
                        prefix_len = len(parts[0])
                        bar_len = len(parts[1])
                        full_bar_len = width - prefix_len - 5
                        self.color_at(y, x_offset, width=width, fg=float(bar_len / full_bar_len))
                        for i, x in enumerate(
                            range(x_offset + prefix_len + 1, x_offset + prefix_len + 1 + bar_len),
                        ):
                            self.color_at(y, x, width=1, fg=float(i / full_bar_len))
                    else:
                        self.color_at(y, x_offset, width=width, fg=color, attr=attr)

            y_start += len(fmts)
            prev_device_index = device.tuple_index

    def destroy(self):
        super().destroy()
        # Signal the snapshot daemon to exit its loop.
        self._daemon_running.clear()

    def print_width(self):
        """Width used in one-shot print mode: wide layout only with bars enabled."""
        if self.device_count > 0 and self.width >= 100:
            return self.width
        return 79

    def print(self):  # pylint: disable=too-many-locals,too-many-branches
        """One-shot mode: emit the colored device table to stdout."""
        lines = [time.strftime('%a %b %d %H:%M:%S %Y'), *self.header_lines(compact=False)]

        if self.device_count > 0:
            prev_device_index = self.snapshots[0].tuple_index
            for device in self.snapshots:
                if (
                    len(prev_device_index) != len(device.tuple_index)
                    or prev_device_index[0] != device.tuple_index[0]
                ):
                    lines.append(
                        '├───────────────────────────────┼──────────────────────┼──────────────────────┤',
                    )

                def colorize(s):
                    if len(s) > 0:
                        # pylint: disable-next=cell-var-from-loop
                        return colored(s, device.display_color)  # noqa: B023
                    return ''

                fmts = self.mig_formats if device.is_mig_device else self.formats_full
                for fmt in fmts:
                    line = fmt.format(**device.__dict__)
                    # Color each cell but keep the frame glyphs uncolored.
                    lines.append('│'.join(map(colorize, line.split('│'))))

                prev_device_index = device.tuple_index
            lines.append(
                '╘═══════════════════════════════╧══════════════════════╧══════════════════════╛',
            )

            if self.width >= 100:
                # Append the MEM/UTL bars to the right of the already-built rows.
                remaining_width = self.width - 79
                y_start = 7
                prev_device_index = self.snapshots[0].tuple_index
                for index, device in enumerate(self.snapshots):
                    if (
                        len(prev_device_index) != len(device.tuple_index)
                        or prev_device_index[0] != device.tuple_index[0]
                    ):
                        lines[y_start] = (
                            lines[y_start][:-1] + '┼' + '─' * (remaining_width - 1) + '┤'
                        )
                        y_start += 1
                    elif index == 0:
                        lines[y_start - 1] = (
                            lines[y_start - 1][:-1] + '╪' + '═' * (remaining_width - 1) + '╕'
                        )

                    matrix = [
                        (
                            'MEM',
                            device.memory_percent,
                            device.memory_display_color,
                        ),
                        (
                            'UTL',
                            device.gpu_utilization,
                            device.gpu_display_color,
                        ),
                    ]
                    if device.is_mig_device:
                        matrix.pop()
                    for y, (prefix, utilization, color) in enumerate(matrix, start=y_start):
                        bar = make_bar(  # pylint: disable=disallowed-name
                            prefix,
                            utilization,
                            width=remaining_width - 3,
                        )
                        lines[y] += f' {colored(bar, color)} │'

                    if index == len(self.snapshots) - 1:
                        lines[y_start + len(matrix)] = (
                            lines[y_start + len(matrix)][:-1]
                            + '╧'
                            + '═' * (remaining_width - 1)
                            + '╛'
                        )

                    y_start += len(matrix)
                    prev_device_index = device.tuple_index

        lines = '\n'.join(lines)
        if self.ascii:
            lines = lines.translate(self.ASCII_TRANSTABLE)

        try:
            print(lines)
        except UnicodeError:
            # Terminal cannot encode the box-drawing glyphs; fall back to ASCII.
            print(lines.translate(self.ASCII_TRANSTABLE))

    def press(self, key):
        self.root.keymaps.use_keymap('device')
        self.root.press(key)
# NOTE(review): the line below is tar-archive header residue (ustar header for the
# next member of the dumped tarball); it is not Python and is preserved verbatim.
nvitop-1.4.2/nvitop/tui/screens/main/host.py000066400000000000000000000402071474547113600211100ustar00rootroot00000000000000# 
This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3.
# NOTE(review): the fragment above is the continuation of a comment whose '# '
# prefix landed at the end of the previous line of this corrupted tarball dump;
# it is preserved verbatim (and is not valid Python as it stands).
# pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring

import threading
import time

from nvitop.tui.library import (
    NA,
    BufferedHistoryGraph,
    Device,
    Displayable,
    GiB,
    bytes2human,
    colored,
    host,
    make_bar,
    timedelta2human,
)


class HostPanel(Displayable):  # pylint: disable=too-many-instance-attributes
    """Middle panel: host CPU/MEM/SWP history graphs plus averaged GPU graphs.

    In compact mode only two bar lines are drawn; in full mode scrolling history
    graphs are rendered (CPU above, memory/swap below, GPU graphs at the right
    when the terminal is at least 100 columns wide). A daemon thread feeds the
    history buffers at ``SNAPSHOT_INTERVAL``.
    """

    NAME = 'host'
    # Seconds between background samples (see ``set_snapshot_interval``).
    SNAPSHOT_INTERVAL = 0.5

    def __init__(self, devices, compact, win, root):
        """Set up geometry and, in interactive mode, the history graph wrappers."""
        super().__init__(win, root)

        self.devices = devices
        self.device_count = len(self.devices)

        if win is not None:
            # Interactive mode only: printing mode never draws graphs.
            self.average_gpu_memory_percent = None
            self.average_gpu_utilization = None
            self.enable_history()

        self._compact = compact
        self.width = max(79, root.width)
        self.full_height = 12
        self.compact_height = 2
        self.height = self.compact_height if compact else self.full_height

        self.cpu_percent = None
        self.load_average = None
        self.virtual_memory = None
        self.swap_memory = None
        self._snapshot_daemon = threading.Thread(
            name='host-snapshot-daemon',
            target=self._snapshot_target,
            daemon=True,
        )
        self._daemon_running = threading.Event()

    @property
    def width(self):
        return self._width

    @width.setter
    def width(self, value):
        width = max(79, value)
        if self._width != width:
            if self.visible:
                self.need_redraw = True
            # Resize the GPU history graphs to fill the area right of column 80.
            graph_width = max(width - 80, 20)
            if self.win is not None:
                self.average_gpu_memory_percent.width = graph_width
                self.average_gpu_utilization.width = graph_width
                for device in self.devices:
                    device.memory_percent.history.width = graph_width
                    device.gpu_utilization.history.width = graph_width
        self._width = width

    @property
    def compact(self):
        # ASCII terminals cannot draw the graphs, so they force compact mode.
        return self._compact or self.ascii

    @compact.setter
    def compact(self, value):
        value = value or self.ascii
        if self._compact != value:
            self.need_redraw = True
            self._compact = value
            self.height = self.compact_height if self.compact else self.full_height

    def enable_history(self):
        """Wrap the host metric functions and device fields in history-graph buffers.

        After this call, ``host.cpu_percent`` etc. record every sample into a
        ``BufferedHistoryGraph`` that the draw code renders as a scrolling chart.
        """
        host.cpu_percent = BufferedHistoryGraph(
            interval=1.0,
            width=77,
            height=5,
            upsidedown=False,
            baseline=0.0,
            upperbound=100.0,
            dynamic_bound=False,
            format='CPU: {:.1f}%'.format,
        )(host.cpu_percent)
        host.virtual_memory = BufferedHistoryGraph(
            interval=1.0,
            width=77,
            height=4,
            upsidedown=True,
            baseline=0.0,
            upperbound=100.0,
            dynamic_bound=False,
            format='{:.1f}%'.format,
        )(host.virtual_memory, get_value=lambda vm: vm.percent)
        host.swap_memory = BufferedHistoryGraph(
            interval=1.0,
            width=77,
            height=1,
            upsidedown=False,
            baseline=0.0,
            upperbound=100.0,
            dynamic_bound=False,
            format='{:.1f}%'.format,
        )(host.swap_memory, get_value=lambda sm: sm.percent)

        def percentage(x):
            return f'{x:.1f}%' if x is not NA else NA

        def enable_history(device):
            # Per-device MEM graph grows downward, UTL graph grows upward.
            device.memory_percent = BufferedHistoryGraph(
                interval=1.0,
                width=20,
                height=5,
                upsidedown=False,
                baseline=0.0,
                upperbound=100.0,
                dynamic_bound=False,
                format=lambda x: f'GPU {device.display_index} MEM: {percentage(x)}',
            )(device.memory_percent)
            device.gpu_utilization = BufferedHistoryGraph(
                interval=1.0,
                width=20,
                height=5,
                upsidedown=True,
                baseline=0.0,
                upperbound=100.0,
                dynamic_bound=False,
                format=lambda x: f'GPU {device.display_index} UTL: {percentage(x)}',
            )(device.gpu_utilization)

        for device in self.devices:
            enable_history(device)

        prefix = 'AVG ' if self.device_count > 1 else ''
        self.average_gpu_memory_percent = BufferedHistoryGraph(
            interval=1.0,
            width=20,
            height=5,
            upsidedown=False,
            baseline=0.0,
            upperbound=100.0,
            dynamic_bound=False,
            format=lambda x: f'{prefix}GPU MEM: {percentage(x)}',
        )
        self.average_gpu_utilization = BufferedHistoryGraph(
            interval=1.0,
            width=20,
            height=5,
            upsidedown=True,
            baseline=0.0,
            upperbound=100.0,
            dynamic_bound=False,
            format=lambda x: f'{prefix}GPU UTL: {percentage(x)}',
        )

    @classmethod
    def set_snapshot_interval(cls, interval):
        """Re-tune the daemon sleep for a new refresh interval (sampled ~3x faster)."""
        assert interval > 0.0
        interval = float(interval)
        cls.SNAPSHOT_INTERVAL = min(interval / 3.0, 0.5)

    def take_snapshots(self):
        """Sample host metrics and fold per-device values into the average graphs."""
        host.cpu_percent()
        host.virtual_memory()
        host.swap_memory()
        self.load_average = host.load_average()
        self.cpu_percent = host.cpu_percent.history.last_value
        self.virtual_memory = host.virtual_memory.history.last_retval
        self.swap_memory = host.swap_memory.history.last_retval

        total_memory_used = 0
        total_memory_total = 0
        gpu_utilizations = []
        for device in self.devices:
            memory_used = device.snapshot.memory_used
            memory_total = device.snapshot.memory_total
            gpu_utilization = device.snapshot.gpu_utilization
            if memory_used is not NA and memory_total is not NA:
                total_memory_used += memory_used
                total_memory_total += memory_total
            if gpu_utilization is not NA:
                gpu_utilizations.append(float(gpu_utilization))
        if total_memory_total > 0:
            self.average_gpu_memory_percent.add(100.0 * total_memory_used / total_memory_total)
        if len(gpu_utilizations) > 0:
            self.average_gpu_utilization.add(sum(gpu_utilizations) / len(gpu_utilizations))

    def _snapshot_target(self):
        """Daemon-thread loop: sample until the running event is cleared."""
        self._daemon_running.wait()
        while self._daemon_running.is_set():
            self.take_snapshots()
            time.sleep(self.SNAPSHOT_INTERVAL)

    def frame_lines(self, compact=None):
        """Build the static frame around the graphs; empty in compact/ASCII mode.

        NOTE(review): ``data_line`` below appears to have had its internal run of
        spaces collapsed by the extraction of this dump -- it should be a full
        79-column row; restore from upstream.
        """
        if compact is None:
            compact = self.compact
        if compact or self.ascii:
            return []

        remaining_width = self.width - 79
        data_line = (
            '│ │'
        )
        separator_line = (
            '├────────────╴120s├─────────────────────────╴60s├──────────╴30s├──────────────┤'
        )
        if self.width >= 100:
            data_line += ' ' * (remaining_width - 1) + '│'
            separator_line = separator_line[:-1] + '┼' + '─' * (remaining_width - 1) + '┤'

        frame = [
            '╞═══════════════════════════════╧══════════════════════╧══════════════════════╡',
            data_line,
            data_line,
            data_line,
            data_line,
            data_line,
            separator_line,
            data_line,
            data_line,
            data_line,
            data_line,
            data_line,
            '╘═════════════════════════════════════════════════════════════════════════════╛',
        ]
        if self.width >= 100:
            frame[0] = frame[0][:-1] + '╪' + '═' * (remaining_width - 1) + '╡'
            frame[-1] = frame[-1][:-1] + '╧' + '═' * (remaining_width - 1) + '╛'

        return frame

    def poke(self):
        # Lazily start the daemon and take an immediate first sample.
        if not self._daemon_running.is_set():
            self._daemon_running.set()
            self._snapshot_daemon.start()
            self.take_snapshots()

        super().poke()

    def draw(self):  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
        """Render either the two compact bars or the full history graphs."""
        self.color_reset()

        if self.load_average is not None:
            # Clamp each figure to 5 characters ('9999+' past 10000).
            load_average = tuple(
                f'{value:5.2f}'[:5] if value < 10000.0 else '9999+'
                for value in self.load_average
            )
        else:
            load_average = (NA,) * 3
        load_average = 'Load Average: {} {} {}'.format(*load_average)

        if self.compact:
            width_right = len(load_average) + 4
            width_left = self.width - 2 - width_right
            cpu_bar = '[ {} ]'.format(
                make_bar(
                    'CPU',
                    self.cpu_percent,
                    width_left - 4,
                    extra_text=f' UPTIME: {timedelta2human(host.uptime(), round=True)}',
                ),
            )
            memory_bar = '[ {} ]'.format(
                make_bar(
                    'MEM',
                    self.virtual_memory.percent,
                    width_left - 4,
                    extra_text=f' USED: {bytes2human(self.virtual_memory.used, min_unit=GiB)}',
                ),
            )
            swap_bar = '[ {} ]'.format(make_bar('SWP', self.swap_memory.percent, width_right - 4))
            self.addstr(self.y, self.x, f'{cpu_bar} ( {load_average} )')
            self.addstr(self.y + 1, self.x, f'{memory_bar} {swap_bar}')
            self.color_at(self.y, self.x, width=len(cpu_bar), fg='cyan', attr='bold')
            self.color_at(self.y + 1, self.x, width=width_left, fg='magenta', attr='bold')
            self.color_at(self.y, self.x + width_left + 2, width=width_right, attr='bold')
            self.color_at(
                self.y + 1,
                self.x + width_left + 2,
                width=width_right,
                fg='blue',
                attr='bold',
            )
            return

        remaining_width = self.width - 79

        if self.need_redraw:
            for y, line in enumerate(self.frame_lines(), start=self.y - 1):
                self.addstr(y, self.x, line)
            # Dim the time-scale tick labels on the separator row.
            self.color_at(self.y + 5, self.x + 14, width=4, attr='dim')
            self.color_at(self.y + 5, self.x + 45, width=3, attr='dim')
            self.color_at(self.y + 5, self.x + 60, width=3, attr='dim')

            if self.width >= 100:
                # Extra tick labels across the widened GPU-graph area.
                for offset, string in (
                    (20, '╴30s├'),
                    (35, '╴60s├'),
                    (66, '╴120s├'),
                    (96, '╴180s├'),
                    (126, '╴240s├'),
                    (156, '╴300s├'),
                ):
                    if offset > remaining_width:
                        break
                    self.addstr(self.y + 5, self.x + self.width - offset, string)
                    self.color_at(
                        self.y + 5,
                        self.x + self.width - offset + 1,
                        width=len(string) - 2,
                        attr='dim',
                    )

        self.color(fg='cyan')
        for y, line in enumerate(host.cpu_percent.history.graph, start=self.y):
            self.addstr(y, self.x + 1, line)

        self.color(fg='magenta')
        for y, line in enumerate(host.virtual_memory.history.graph, start=self.y + 6):
            self.addstr(y, self.x + 1, line)

        self.color(fg='blue')
        for y, line in enumerate(host.swap_memory.history.graph, start=self.y + 10):
            self.addstr(y, self.x + 1, line)

        if self.width >= 100:
            # With a process selected on a multi-GPU host, show that device's
            # graphs; otherwise show the averaged graphs.
            if self.device_count > 1 and self.parent.selection.is_set():
                device = self.parent.selection.process.device
                gpu_memory_percent = device.memory_percent.history
                gpu_utilization = device.gpu_utilization.history
            else:
                gpu_memory_percent = self.average_gpu_memory_percent
                gpu_utilization = self.average_gpu_utilization

            if self.TERM_256COLOR:
                # Gradient-color each graph row by its height.
                for i, (y, line) in enumerate(enumerate(gpu_memory_percent.graph, start=self.y)):
                    self.addstr(y, self.x + 79, line, self.get_fg_bg_attr(fg=1.0 - i / 4.0))
                for i, (y, line) in enumerate(enumerate(gpu_utilization.graph, start=self.y + 6)):
                    self.addstr(y, self.x + 79, line, self.get_fg_bg_attr(fg=i / 4.0))
            else:
                self.color(fg=Device.color_of(gpu_memory_percent.last_value, type='memory'))
                for y, line in enumerate(gpu_memory_percent.graph, start=self.y):
                    self.addstr(y, self.x + 79, line)

                self.color(fg=Device.color_of(gpu_utilization.last_value, type='gpu'))
                for y, line in enumerate(gpu_utilization.graph, start=self.y + 6):
                    self.addstr(y, self.x + 79, line)

        # Overlay the numeric captions on top of the graphs.
        self.color_reset()
        self.addstr(self.y, self.x + 1, f' {load_average} ')
        self.addstr(self.y + 1, self.x + 1, f' {host.cpu_percent.history} ')
        self.addstr(
            self.y + 9,
            self.x + 1,
            f' MEM: {bytes2human(self.virtual_memory.used, min_unit=GiB)} ({host.virtual_memory.history}) ',
        )
        self.addstr(
            self.y + 10,
            self.x + 1,
            f' SWP: {bytes2human(self.swap_memory.used, min_unit=GiB)} ({host.swap_memory.history}) ',
        )
        if self.width >= 100:
            self.addstr(self.y, self.x + 79, f' {gpu_memory_percent} ')
            self.addstr(self.y + 10, self.x + 79, f' {gpu_utilization} ')

    def destroy(self):
        super().destroy()
        # Signal the snapshot daemon to exit its loop.
        self._daemon_running.clear()

    def print_width(self):
        """Width used in one-shot print mode: wide layout only when graphs fit."""
        if self.device_count > 0 and self.width >= 100:
            return self.width
        return 79

    def print(self):
        """One-shot mode: sample once and emit the two colored bar lines to stdout."""
        self.cpu_percent = host.cpu_percent()
        self.virtual_memory = host.virtual_memory()
        self.swap_memory = host.swap_memory()
        self.load_average = host.load_average()

        if self.load_average is not None:
            load_average = tuple(
                f'{value:5.2f}'[:5] if value < 10000.0 else '9999+'
                for value in self.load_average
            )
        else:
            load_average = (NA,) * 3
        load_average = 'Load Average: {} {} {}'.format(*load_average)

        width_right = len(load_average) + 4
        width_left = self.width - 2 - width_right
        cpu_bar = '[ {} ]'.format(
            make_bar(
                'CPU',
                self.cpu_percent,
                width_left - 4,
                extra_text=f' UPTIME: {timedelta2human(host.uptime(), round=True)}',
            ),
        )
        memory_bar = '[ {} ]'.format(
            make_bar(
                'MEM',
                self.virtual_memory.percent,
                width_left - 4,
                extra_text=f' USED: {bytes2human(self.virtual_memory.used, min_unit=GiB)}',
            ),
        )
        swap_bar = '[ {} ]'.format(make_bar('SWP', self.swap_memory.percent, width_right - 4))

        lines = [
            '{} {}'.format(
                colored(cpu_bar, color='cyan', attrs=('bold',)),
                colored(f'( {load_average} )', attrs=('bold',)),
            ),
            '{} {}'.format(
                colored(memory_bar, color='magenta', attrs=('bold',)),
                colored(swap_bar, color='blue', attrs=('bold',)),
            ),
        ]

        lines = '\n'.join(lines)
        if self.ascii:
            lines = lines.translate(self.ASCII_TRANSTABLE)

        try:
            print(lines)
        except UnicodeError:
            # Terminal cannot encode the bar glyphs; fall back to ASCII.
            print(lines.translate(self.ASCII_TRANSTABLE))

    def press(self, key):
        self.root.keymaps.use_keymap('host')
        self.root.press(key)
# NOTE(review): the line below is tar-archive header residue (ustar header for the
# next member of the dumped tarball); it is not Python and is preserved verbatim.
nvitop-1.4.2/nvitop/tui/screens/main/process.py000066400000000000000000000602661474547113600216050ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3.
# pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring from __future__ import annotations import itertools import threading import time from operator import attrgetter, xor from typing import TYPE_CHECKING, Any, NamedTuple from nvitop.tui.library import ( HOSTNAME, LARGE_INTEGER, SUPERUSER, USERCONTEXT, USERNAME, Displayable, GpuProcess, MouseEvent, Selection, WideString, colored, cut_string, host, ttl_cache, wcslen, ) if TYPE_CHECKING: from collections.abc import Callable class Order(NamedTuple): key: Callable[[Any], Any] reverse: bool offset: int column: str previous: str next: str class ProcessPanel(Displayable): # pylint: disable=too-many-instance-attributes NAME = 'process' SNAPSHOT_INTERVAL = 0.5 ORDERS = { 'natural': Order( key=attrgetter('device.tuple_index', '_gone', 'username', 'pid'), reverse=False, offset=3, column='ID', previous='time', next='pid', ), 'pid': Order( key=attrgetter('_gone', 'pid', 'device.tuple_index'), reverse=False, offset=10, column='PID', previous='natural', next='username', ), 'username': Order( key=attrgetter('_gone', 'username', 'pid', 'device.tuple_index'), reverse=False, offset=19, column='USER', previous='pid', next='gpu_memory', ), 'gpu_memory': Order( key=attrgetter( '_gone', 'gpu_memory', 'gpu_sm_utilization', 'cpu_percent', 'pid', 'device.tuple_index', ), reverse=True, offset=25, column='GPU-MEM', previous='username', next='sm_utilization', ), 'sm_utilization': Order( key=attrgetter( '_gone', 'gpu_sm_utilization', 'gpu_memory', 'cpu_percent', 'pid', 'device.tuple_index', ), reverse=True, offset=34, column='SM', previous='gpu_memory', next='cpu_percent', ), 'cpu_percent': Order( key=attrgetter('_gone', 'cpu_percent', 'memory_percent', 'pid', 'device.tuple_index'), reverse=True, offset=38, column='%CPU', previous='sm_utilization', next='memory_percent', ), 'memory_percent': Order( key=attrgetter('_gone', 'memory_percent', 'cpu_percent', 'pid', 'device.tuple_index'), reverse=True, 
offset=44, column='%MEM', previous='cpu_percent', next='time', ), 'time': Order( key=attrgetter('_gone', 'running_time', 'pid', 'device.tuple_index'), reverse=True, offset=50, column='TIME', previous='memory_percent', next='natural', ), } # pylint: disable-next=too-many-arguments def __init__(self, devices, compact, filters, *, win, root): super().__init__(win, root) self.devices = devices self._compact = compact self.width = max(79, root.width) self.height = self._full_height = self.compact_height = 7 self.filters = [None, *filters] self.host_headers = ['%CPU', '%MEM', 'TIME', 'COMMAND'] self.selection = Selection(panel=self) self.host_offset = -1 self.y_mouse = None self._order = 'natural' self.reverse = False self.has_snapshots = False self._snapshot_buffer = None self._snapshots = [] self.snapshot_lock = threading.Lock() self._snapshot_daemon = threading.Thread( name='process-snapshot-daemon', target=self._snapshot_target, daemon=True, ) self._daemon_running = threading.Event() @property def width(self): return self._width @width.setter def width(self, value): width = max(79, value) if self._width != width and self.visible: self.need_redraw = True self._width = width @property def compact(self): return self._compact or self.order != 'natural' @compact.setter def compact(self, value): if self._compact != value: self.need_redraw = True self._compact = value processes = self.snapshots n_processes, n_devices = ( len(processes), len({p.device.physical_index for p in processes}), ) self.full_height = 1 + max(6, 5 + n_processes + n_devices - 1) self.compact_height = 1 + max(6, 5 + n_processes) self.height = self.compact_height if self.compact else self.full_height @property def full_height(self): return self._full_height if self.order == 'natural' else self.compact_height @full_height.setter def full_height(self, value): self._full_height = value @property def order(self): return self._order @order.setter def order(self, value): if self._order != value: self._order = 
value self.height = self.compact_height if self.compact else self.full_height @property def snapshots(self): return self._snapshots @snapshots.setter def snapshots(self, snapshots): if snapshots is None: return self.has_snapshots = True time_length = max(4, max((len(p.running_time_human) for p in snapshots), default=4)) time_header = ' ' * (time_length - 4) + 'TIME' info_length = max((len(p.host_info) for p in snapshots), default=0) n_processes, n_devices = len(snapshots), len({p.device.physical_index for p in snapshots}) self.full_height = 1 + max(6, 5 + n_processes + n_devices - 1) self.compact_height = 1 + max(6, 5 + n_processes) height = self.compact_height if self.compact else self.full_height key, reverse, *_ = self.ORDERS[self.order] snapshots.sort(key=key, reverse=xor(reverse, self.reverse)) old_host_offset = self.host_offset with self.snapshot_lock: self.need_redraw = ( self.need_redraw or self.height > height or self.host_headers[-2] != time_header ) self._snapshots = snapshots self.host_headers[-2] = time_header self.height = height self.host_offset = max(-1, min(self.host_offset, info_length - self.width + 39)) if old_host_offset not in {self.host_offset, LARGE_INTEGER}: self.beep() if self.selection.is_set(): identity = self.selection.identity self.selection.reset() for i, process in enumerate(snapshots): if process._ident == identity: # pylint: disable=protected-access self.selection.index = i self.selection.process = process break @classmethod def set_snapshot_interval(cls, interval): assert interval > 0.0 interval = float(interval) cls.SNAPSHOT_INTERVAL = min(interval / 3.0, 1.0) cls.take_snapshots = ttl_cache(ttl=interval)( cls.take_snapshots.__wrapped__, # pylint: disable=no-member ) def ensure_snapshots(self): if not self.has_snapshots: self.snapshots = self.take_snapshots() @ttl_cache(ttl=2.0) def take_snapshots(self): snapshots = GpuProcess.take_snapshots(self.processes, failsafe=True) for condition in self.filters: snapshots = 
filter(condition, snapshots) snapshots = list(snapshots) time_length = max(4, max((len(p.running_time_human) for p in snapshots), default=4)) for snapshot in snapshots: snapshot.host_info = WideString( '{:>5} {:>5} {} {}'.format( snapshot.cpu_percent_string.replace('%', ''), snapshot.memory_percent_string.replace('%', ''), ' ' * (time_length - len(snapshot.running_time_human)) + snapshot.running_time_human, snapshot.command, ), ) with self.snapshot_lock: self._snapshot_buffer = snapshots return snapshots def _snapshot_target(self): self._daemon_running.wait() while self._daemon_running.is_set(): self.take_snapshots() time.sleep(self.SNAPSHOT_INTERVAL) def header_lines(self): header = [ '╒' + '═' * (self.width - 2) + '╕', '│ {} │'.format('Processes:'.ljust(self.width - 4)), '│ GPU PID USER GPU-MEM %SM {} │'.format( ' '.join(self.host_headers).ljust(self.width - 40), ), '╞' + '═' * (self.width - 2) + '╡', ] if len(self.snapshots) == 0: if self.has_snapshots: message = ' No running processes found{} '.format(' (in WSL)' if host.WSL else '') else: message = ' Gathering process status...' 
header.extend( [f'│ {message.ljust(self.width - 4)} │', '╘' + '═' * (self.width - 2) + '╛'], ) return header @property def processes(self): return list( itertools.chain.from_iterable(device.processes().values() for device in self.devices), ) def poke(self): if not self._daemon_running.is_set(): self._daemon_running.set() self._snapshot_daemon.start() self.snapshots = self._snapshot_buffer self.selection.within_window = False if len(self.snapshots) > 0 and self.selection.is_set(): y = self.y + 5 prev_device_index = None for process in self.snapshots: device_index = process.device.physical_index if prev_device_index != device_index: if not self.compact and prev_device_index is not None: y += 1 prev_device_index = device_index if self.selection.is_same(process): self.selection.within_window = ( self.root.y <= y < self.root.termsize[0] and self.width >= 79 ) if not self.selection.within_window: if y < self.root.y: self.parent.y += self.root.y - y elif y >= self.root.termsize[0]: self.parent.y -= y - self.root.termsize[0] + 1 self.parent.update_size(self.root.termsize) self.need_redraw = True break y += 1 super().poke() def draw(self): # pylint: disable=too-many-locals,too-many-branches,too-many-statements self.color_reset() if self.need_redraw: if SUPERUSER: self.addstr(self.y, self.x + 1, '!CAUTION: SUPERUSER LOGGED-IN.') self.color_at(self.y, self.x + 1, width=1, fg='red', attr='blink') self.color_at(self.y, self.x + 2, width=29, fg='yellow', attr='italic') for y, line in enumerate(self.header_lines(), start=self.y + 1): self.addstr(y, self.x, line) context_width = wcslen(USERCONTEXT) if not host.WINDOWS or len(USERCONTEXT) == context_width: # Do not support windows-curses with wide characters username_width = wcslen(USERNAME) hostname_width = wcslen(HOSTNAME) offset = self.x + self.width - context_width - 2 self.addstr(self.y + 2, self.x + offset, USERCONTEXT) self.color_at(self.y + 2, self.x + offset, width=context_width, attr='bold') self.color_at( self.y + 2, 
self.x + offset, width=username_width, fg=('yellow' if SUPERUSER else 'magenta'), attr='bold', ) self.color_at( self.y + 2, self.x + offset + username_width + 1, width=hostname_width, fg='green', attr='bold', ) self.addstr(self.y + 3, self.x + 1, ' GPU PID USER GPU-MEM %SM ') host_offset = max(self.host_offset, 0) command_offset = max(14 + len(self.host_headers[-2]) - host_offset, 0) if command_offset > 0: host_headers = ' '.join(self.host_headers) self.addstr( self.y + 3, self.x + 38, f'{host_headers[host_offset:].ljust(self.width - 40)}', ) else: self.addstr(self.y + 3, self.x + 38, '{}'.format('COMMAND'.ljust(self.width - 40))) _, reverse, offset, column, *_ = self.ORDERS[self.order] column_width = len(column) reverse = xor(reverse, self.reverse) indicator = '▼' if reverse else '▲' if self.order in {'cpu_percent', 'memory_percent', 'time'}: offset -= host_offset if self.order == 'time': offset += len(self.host_headers[-2]) - 4 if offset > 38 or host_offset == 0: self.addstr(self.y + 3, self.x + offset - 1, column + indicator) self.color_at( self.y + 3, self.x + offset - 1, width=column_width, attr='bold | underline', ) elif offset <= 38 < offset + column_width: self.addstr(self.y + 3, self.x + 38, (column + indicator)[39 - offset :]) if offset + column_width >= 40: self.color_at( self.y + 3, self.x + 38, width=offset + column_width - 39, attr='bold | underline', ) if offset + column_width >= 39: self.color_at(self.y + 3, self.x + offset + column_width - 1, width=1, attr='bold') elif self.order == 'natural' and not reverse: self.color_at(self.y + 3, self.x + 2, width=3, attr='bold') else: self.addstr(self.y + 3, self.x + offset - 1, column + indicator) self.color_at( self.y + 3, self.x + offset - 1, width=column_width, attr='bold | underline', ) self.color_at(self.y + 3, self.x + offset + column_width - 1, width=1, attr='bold') hint = True if self.y_mouse is not None: self.selection.reset() hint = False self.selection.within_window = False if len(self.snapshots) 
> 0: y = self.y + 5 prev_device_index = None prev_device_display_index = None color = -1 for process in self.snapshots: device_index = process.device.physical_index device_display_index = process.device.display_index if prev_device_index != device_index: if not self.compact and prev_device_index is not None: self.addstr(y, self.x, '├' + '─' * (self.width - 2) + '┤') if y == self.y_mouse: self.y_mouse += 1 y += 1 prev_device_index = device_index if prev_device_display_index != device_display_index: color = process.device.snapshot.display_color prev_device_display_index = device_display_index host_info = process.host_info if self.host_offset < 0: host_info = cut_string(host_info, padstr='..', maxlen=self.width - 39) else: host_info = WideString(host_info)[self.host_offset :] self.addstr( y, self.x, '│{:>4} {:>7} {} {} {:>8} {:>3} {} │'.format( device_display_index, cut_string(process.pid, maxlen=7, padstr='.'), process.type, str( WideString(cut_string(process.username, maxlen=7, padstr='+')).rjust(7), ), process.gpu_memory_human, process.gpu_sm_utilization_string.replace('%', ''), WideString(host_info).ljust(self.width - 39)[: self.width - 39], ), ) if self.host_offset > 0: self.addstr(y, self.x + 37, ' ') is_zombie = process.is_zombie no_permissions = process.no_permissions is_gone = process.is_gone if (is_zombie or no_permissions or is_gone) and command_offset == 0: self.addstr(y, self.x + 38, process.command) if y == self.y_mouse: self.selection.process = process hint = True if self.selection.is_same(process): self.color_at( y, self.x + 1, width=self.width - 2, fg='yellow' if self.selection.is_tagged(process) else 'cyan', attr='bold | reverse', ) self.selection.within_window = ( self.root.y <= y < self.root.termsize[0] and self.width >= 79 ) else: owned = str(process.username) == USERNAME or SUPERUSER if self.selection.is_same_on_host(process): self.addstr(y, self.x + 1, '=', self.get_fg_bg_attr(attr='bold | blink')) self.color_at(y, self.x + 2, width=3, fg=color) 
if self.selection.is_tagged(process): self.color_at( y, self.x + 5, width=self.width - 6, fg='yellow', attr='bold' if owned else 'bold | dim', ) elif not owned: self.color_at(y, self.x + 5, width=self.width - 6, attr='dim') if is_zombie or no_permissions: self.color_at(y, self.x + 38 + command_offset, width=14, fg='yellow') elif is_gone: self.color_at(y, self.x + 38 + command_offset, width=15, fg='red') y += 1 self.addstr(y, self.x, '╘' + '═' * (self.width - 2) + '╛') if not hint: self.selection.clear() elif self.has_snapshots: message = ' No running processes found{} '.format(' (in WSL)' if host.WSL else '') self.addstr(self.y + 5, self.x, f'│ {message.ljust(self.width - 4)} │') text_offset = self.x + self.width - 47 if len(self.selection.tagged) > 0 or ( self.selection.owned() and self.selection.within_window ): self.addstr(self.y, text_offset, '(Press ^C(INT)/T(TERM)/K(KILL) to send signals)') self.color_at(self.y, text_offset + 7, width=2, fg='magenta', attr='bold | italic') self.color_at(self.y, text_offset + 10, width=3, fg='red', attr='bold') self.color_at(self.y, text_offset + 15, width=1, fg='magenta', attr='bold | italic') self.color_at(self.y, text_offset + 17, width=4, fg='red', attr='bold') self.color_at(self.y, text_offset + 23, width=1, fg='magenta', attr='bold | italic') self.color_at(self.y, text_offset + 25, width=4, fg='red', attr='bold') else: self.addstr(self.y, text_offset, ' ' * 47) def finalize(self): self.y_mouse = None super().finalize() def destroy(self): super().destroy() self._daemon_running.clear() def print_width(self): self.ensure_snapshots() return min( self.width, max((39 + len(process.host_info) for process in self.snapshots), default=79), ) def print(self): self.ensure_snapshots() lines = ['', *self.header_lines()] lines[2] = ''.join( ( lines[2][: -2 - wcslen(USERCONTEXT)], colored(USERNAME, color=('yellow' if SUPERUSER else 'magenta'), attrs=('bold',)), colored('@', attrs=('bold',)), colored(HOSTNAME, color='green', 
attrs=('bold',)), lines[2][-2:], ), ) if len(self.snapshots) > 0: key, reverse, *_ = self.ORDERS['natural'] self.snapshots.sort(key=key, reverse=reverse) prev_device_index = None prev_device_display_index = None color = None for process in self.snapshots: device_index = process.device.physical_index device_display_index = process.device.display_index if prev_device_index != device_index: if prev_device_index is not None: lines.append('├' + '─' * (self.width - 2) + '┤') prev_device_index = device_index if prev_device_display_index != device_display_index: color = process.device.snapshot.display_color prev_device_display_index = device_display_index host_info = cut_string(process.host_info, padstr='..', maxlen=self.width - 39) info = '{:>7} {} {} {:>8} {:>3} {}'.format( cut_string(process.pid, maxlen=7, padstr='.'), process.type, str(WideString(cut_string(process.username, maxlen=7, padstr='+')).rjust(7)), process.gpu_memory_human, process.gpu_sm_utilization_string.replace('%', ''), WideString(host_info).ljust(self.width - 39)[: self.width - 39], ) if process.is_zombie or process.no_permissions or process.is_gone: info = info.split(process.command) if process.username != USERNAME and not SUPERUSER: info = (colored(item, attrs=('dark',)) for item in info) info = colored( process.command, color=('red' if process.is_gone else 'yellow'), ).join(info) elif process.username != USERNAME and not SUPERUSER: info = colored(info, attrs=('dark',)) lines.append('│{} {} │'.format(colored(f'{device_display_index:>4}', color), info)) lines.append('╘' + '═' * (self.width - 2) + '╛') lines = '\n'.join(lines) if self.ascii: lines = lines.translate(self.ASCII_TRANSTABLE) try: print(lines) except UnicodeError: print(lines.translate(self.ASCII_TRANSTABLE)) def press(self, key): self.root.keymaps.use_keymap('process') self.root.press(key) def click(self, event): if event.pressed(1) or event.pressed(3) or event.clicked(1) or event.clicked(3): self.y_mouse = event.y return True direction = 
event.wheel_direction() if event.shift(): self.host_offset += 2 * direction else: self.selection.move(direction=direction) return True def __contains__(self, item): if self.parent.visible and isinstance(item, MouseEvent): return True return super().__contains__(item) nvitop-1.4.2/nvitop/tui/screens/metrics.py000066400000000000000000000532301474547113600206560ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. # pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring # pylint: disable=invalid-name import itertools import threading import time from collections import OrderedDict from nvitop.tui.library import ( HOSTNAME, NA, SUPERUSER, USERCONTEXT, USERNAME, BufferedHistoryGraph, Displayable, GpuProcess, Selection, WideString, bytes2human, cut_string, host, wcslen, ) def get_yticks(history, y_offset): # pylint: disable=too-many-branches,too-many-locals height = history.height baseline = history.baseline bound = history.bound max_bound = history.max_bound scale = history.scale upsidedown = history.upsidedown def p2h_f(p): return 0.01 * scale * p * (max_bound - baseline) * (height - 1) / (bound - baseline) max_height = height - 2 percentages = (1, 2, 4, 5, 8, 10, 20, 40, 50, 80, 100, 200, 400, 500, 800, 1000) h2p = {} p2h = {} h2e = {} for p in percentages: h_f = p2h_f(p) p2h[p] = h = int(h_f) if h not in h2p: if h < max_height: h2p[h] = p h2e[h] = abs(h_f - h) / p elif abs(h_f - h) / p < h2e[h]: h2p[h] = p h2e[h] = abs(h_f - h) / p h2p = sorted(h2p.items()) ticks = [] if len(h2p) >= 2: (hm1, pm1), (h2, p2) = h2p[-2:] if height < 12: ticks = [(hm1, pm1)] if h2e[hm1] < h2e[h2] else [(h2, p2)] else: ticks = [(h2, p2)] if p2 % 2 == 0: p1 = p2 // 2 h1 = int(p2h_f(p1)) p3 = 3 * p1 h3 = int(p2h_f(p3)) if p1 >= 3: ticks.append((h1, p1)) if h2 < h3 < max_height: ticks.append((h3, p3)) else: ticks = list(h2p) if not upsidedown: ticks = [(height - 1 - h, p) for h, 
p in ticks] return [(h + y_offset, p) for h, p in ticks] class ProcessMetricsScreen(Displayable): # pylint: disable=too-many-instance-attributes NAME = 'process-metrics' SNAPSHOT_INTERVAL = 0.5 def __init__(self, win, root): super().__init__(win, root) self.selection = Selection(panel=self) self.used_gpu_memory = None self.gpu_sm_utilization = None self.cpu_percent = None self.used_host_memory = None self.enabled = False self.snapshot_lock = threading.Lock() self._snapshot_daemon = threading.Thread( name='process-metrics-snapshot-daemon', target=self._snapshot_target, daemon=True, ) self._daemon_running = threading.Event() self.x, self.y = root.x, root.y self.width, self.height = root.width, root.height self.left_width = max(20, (self.width - 3) // 2) self.right_width = max(20, (self.width - 2) // 2) self.upper_height = max(5, (self.height - 5 - 3) // 2) self.lower_height = max(5, (self.height - 5 - 2) // 2) @property def visible(self): return self._visible @visible.setter def visible(self, value): if self._visible != value: self.need_redraw = True self._visible = value if self.visible: self._daemon_running.set() try: self._snapshot_daemon.start() except RuntimeError: pass self.take_snapshots() else: self.focused = False def enable(self, state=True): if not self.selection.is_set() or not state: self.disable() return total_host_memory = host.virtual_memory().total total_host_memory_human = bytes2human(total_host_memory) total_gpu_memory = self.process.device.memory_total() total_gpu_memory_human = bytes2human(total_gpu_memory) def format_cpu_percent(value): if value is NA: return f'CPU: {value}' return f'CPU: {value:.1f}%' def format_max_cpu_percent(value): if value is NA: return f'MAX CPU: {value}' return f'MAX CPU: {value:.1f}%' def format_host_memory(value): if value is NA: return f'HOST-MEM: {value}' return 'HOST-MEM: {} ({:.1f}%)'.format( # noqa: UP032 bytes2human(value), round(100.0 * value / total_host_memory, 1), ) def format_max_host_memory(value): if value 
is NA: return f'MAX HOST-MEM: {value}' return 'MAX HOST-MEM: {} ({:.1f}%) / {}'.format( # noqa: UP032 bytes2human(value), round(100.0 * value / total_host_memory, 1), total_host_memory_human, ) def format_gpu_memory(value): if value is not NA and total_gpu_memory is not NA: return 'GPU-MEM: {} ({:.1f}%)'.format( # noqa: UP032 bytes2human(value), round(100.0 * value / total_gpu_memory, 1), ) return f'GPU-MEM: {value}' def format_max_gpu_memory(value): if value is not NA and total_gpu_memory is not NA: return 'MAX GPU-MEM: {} ({:.1f}%) / {}'.format( # noqa: UP032 bytes2human(value), round(100.0 * value / total_gpu_memory, 1), total_gpu_memory_human, ) return f'MAX GPU-MEM: {value}' def format_sm(value): if value is NA: return f'GPU-SM: {value}' return f'GPU-SM: {value:.1f}%' def format_max_sm(value): if value is NA: return f'MAX GPU-SM: {value}' return f'MAX GPU-SM: {value:.1f}%' with self.snapshot_lock: self.cpu_percent = BufferedHistoryGraph( interval=1.0, upperbound=1000.0, width=self.left_width, height=self.upper_height, baseline=0.0, upsidedown=False, dynamic_bound=True, min_bound=10.0, init_bound=100.0, format=format_cpu_percent, max_format=format_max_cpu_percent, ) self.used_host_memory = BufferedHistoryGraph( interval=1.0, upperbound=total_host_memory, width=self.left_width, height=self.lower_height, baseline=0.0, upsidedown=True, dynamic_bound=True, format=format_host_memory, max_format=format_max_host_memory, ) self.used_gpu_memory = BufferedHistoryGraph( interval=1.0, upperbound=total_gpu_memory or 1, width=self.right_width, height=self.upper_height, baseline=0.0, upsidedown=False, dynamic_bound=True, format=format_gpu_memory, max_format=format_max_gpu_memory, ) self.gpu_sm_utilization = BufferedHistoryGraph( interval=1.0, upperbound=100.0, width=self.right_width, height=self.lower_height, baseline=0.0, upsidedown=True, dynamic_bound=True, format=format_sm, max_format=format_max_sm, ) self.cpu_percent.scale = 0.1 self.used_host_memory.scale = 1.0 
self.used_gpu_memory.scale = 1.0 self.gpu_sm_utilization.scale = 1.0 self._daemon_running.set() try: self._snapshot_daemon.start() except RuntimeError: pass self.enabled = True self.take_snapshots() self.update_size() def disable(self): with self.snapshot_lock: self._daemon_running.clear() self.enabled = False self.cpu_percent = None self.used_host_memory = None self.used_gpu_memory = None self.gpu_sm_utilization = None @property def process(self): return self.selection.process @process.setter def process(self, value): self.selection.process = value self.enable() @classmethod def set_snapshot_interval(cls, interval): assert interval > 0.0 interval = float(interval) cls.SNAPSHOT_INTERVAL = min(interval / 3.0, 1.0) def take_snapshots(self): with self.snapshot_lock: if not self.selection.is_set() or not self.enabled: return with GpuProcess.failsafe(): self.process.device.as_snapshot() self.process.update_gpu_status() snapshot = self.process.as_snapshot() self.cpu_percent.add(snapshot.cpu_percent) self.used_host_memory.add(snapshot.host_memory) self.used_gpu_memory.add(snapshot.gpu_memory) self.gpu_sm_utilization.add(snapshot.gpu_sm_utilization) def _snapshot_target(self): while True: self._daemon_running.wait() self.take_snapshots() time.sleep(self.SNAPSHOT_INTERVAL) def update_size(self, termsize=None): n_term_lines, n_term_cols = termsize = super().update_size(termsize=termsize) self.width = n_term_cols - self.x self.height = n_term_lines - self.y self.left_width = max(20, (self.width - 3) // 2) self.right_width = max(20, (self.width - 2) // 2) self.upper_height = max(5, (self.height - 8) // 2) self.lower_height = max(5, (self.height - 7) // 2) self.need_redraw = True with self.snapshot_lock: if self.enabled: self.cpu_percent.graph_size = (self.left_width, self.upper_height) self.used_host_memory.graph_size = (self.left_width, self.lower_height) self.used_gpu_memory.graph_size = (self.right_width, self.upper_height) self.gpu_sm_utilization.graph_size = 
(self.right_width, self.lower_height) return termsize def frame_lines(self): line = '│' + ' ' * self.left_width + '│' + ' ' * self.right_width + '│' return [ '╒' + '═' * (self.width - 2) + '╕', '│ {} │'.format('Process:'.ljust(self.width - 4)), '│ {} │'.format('GPU'.ljust(self.width - 4)), '╞' + '═' * (self.width - 2) + '╡', '│' + ' ' * (self.width - 2) + '│', '╞' + '═' * self.left_width + '╤' + '═' * self.right_width + '╡', *([line] * self.upper_height), '├' + '─' * self.left_width + '┼' + '─' * self.right_width + '┤', *([line] * self.lower_height), '╘' + '═' * self.left_width + '╧' + '═' * self.right_width + '╛', ] def poke(self): if self.visible and not self._daemon_running.is_set(): self._daemon_running.set() try: self._snapshot_daemon.start() except RuntimeError: pass self.take_snapshots() super().poke() def draw(self): # pylint: disable=too-many-statements,too-many-locals,too-many-branches self.color_reset() if self.need_redraw: for y, line in enumerate(self.frame_lines(), start=self.y): self.addstr(y, self.x, line) context_width = wcslen(USERCONTEXT) if not host.WINDOWS or len(USERCONTEXT) == context_width: # Do not support windows-curses with wide characters username_width = wcslen(USERNAME) hostname_width = wcslen(HOSTNAME) offset = self.x + self.width - context_width - 2 self.addstr(self.y + 1, self.x + offset, USERCONTEXT) self.color_at(self.y + 1, self.x + offset, width=context_width, attr='bold') self.color_at( self.y + 1, self.x + offset, width=username_width, fg=('yellow' if SUPERUSER else 'magenta'), attr='bold', ) self.color_at( self.y + 1, self.x + offset + username_width + 1, width=hostname_width, fg='green', attr='bold', ) for offset, string in ( (19, '╴30s├'), (34, '╴60s├'), (65, '╴120s├'), (95, '╴180s├'), (125, '╴240s├'), (155, '╴300s├'), ): for x_offset, width in ( (self.x + 1 + self.left_width, self.left_width), (self.x + 1 + self.left_width + 1 + self.right_width, self.right_width), ): if offset > width: break self.addstr(self.y + 
self.upper_height + 6, x_offset - offset, string) self.color_at( self.y + self.upper_height + 6, x_offset - offset + 1, width=len(string) - 2, attr='dim', ) with self.snapshot_lock: process = self.process.snapshot columns = OrderedDict( [ (' GPU', self.process.device.display_index.rjust(4)), ('PID ', f'{str(process.pid).rjust(3)} {process.type}'), ( 'USER', WideString( cut_string( WideString(process.username).rjust(4), maxlen=32, padstr='+', ), ), ), (' GPU-MEM', process.gpu_memory_human.rjust(8)), (' %SM', str(process.gpu_sm_utilization).rjust(4)), ('%GMBW', str(process.gpu_memory_utilization).rjust(5)), ('%ENC', str(process.gpu_encoder_utilization).rjust(4)), ('%DEC', str(process.gpu_encoder_utilization).rjust(4)), (' %CPU', process.cpu_percent_string.rjust(6)), (' %MEM', process.memory_percent_string.rjust(5)), (' TIME', (' ' + process.running_time_human).rjust(5)), ], ) x = self.x + 1 header = '' fields = WideString() no_break = True for i, (col, value) in enumerate(columns.items()): width = len(value) if x + width < self.width - 2: if i == 0: header += col.rjust(width) fields += value else: header += ' ' + col.rjust(width) fields += ' ' + value x = self.x + 1 + len(fields) else: no_break = False break self.addstr(self.y + 2, self.x + 1, header.ljust(self.width - 2)) self.addstr(self.y + 4, self.x + 1, str(fields.ljust(self.width - 2))) self.color_at( self.y + 4, self.x + 1, width=4, fg=self.process.device.snapshot.display_color, ) if no_break: x = self.x + 1 + len(fields) + 2 if x + 4 < self.width - 2: self.addstr( self.y + 2, x, cut_string('COMMAND', self.width - x - 2, padstr='..').ljust( self.width - x - 2, ), ) if process.is_zombie or process.no_permissions: self.color(fg='yellow') elif process.is_gone: self.color(fg='red') self.addstr( self.y + 4, x, cut_string( WideString(process.command).ljust(self.width - x - 2), self.width - x - 2, padstr='..', ), ) self.color(fg='cyan') for y, line in enumerate(self.cpu_percent.graph, start=self.y + 6): 
self.addstr(y, self.x + 1, line) self.color(fg='magenta') for y, line in enumerate( self.used_host_memory.graph, start=self.y + self.upper_height + 7, ): self.addstr(y, self.x + 1, line) if self.TERM_256COLOR: scale = (self.used_gpu_memory.bound / self.used_gpu_memory.max_bound) / ( self.upper_height - 1 ) for i, (y, line) in enumerate( enumerate(self.used_gpu_memory.graph, start=self.y + 6), ): self.addstr( y, self.x + self.left_width + 2, line, self.get_fg_bg_attr(fg=(self.upper_height - i - 1) * scale), ) scale = (self.gpu_sm_utilization.bound / self.gpu_sm_utilization.max_bound) / ( self.lower_height - 1 ) for i, (y, line) in enumerate( enumerate(self.gpu_sm_utilization.graph, start=self.y + self.upper_height + 7), ): self.addstr( y, self.x + self.left_width + 2, line, self.get_fg_bg_attr(fg=i * scale), ) else: self.color(fg=self.process.device.snapshot.memory_display_color) for y, line in enumerate(self.used_gpu_memory.graph, start=self.y + 6): self.addstr(y, self.x + self.left_width + 2, line) self.color(fg=self.process.device.snapshot.gpu_display_color) for y, line in enumerate( self.gpu_sm_utilization.graph, start=self.y + self.upper_height + 7, ): self.addstr(y, self.x + self.left_width + 2, line) self.color_reset() self.addstr(self.y + 6, self.x + 1, f' {self.cpu_percent.max_value_string()} ') self.addstr(self.y + 7, self.x + 5, f' {self.cpu_percent} ') self.addstr( self.y + self.upper_height + self.lower_height + 5, self.x + 5, f' {self.used_host_memory} ', ) self.addstr( self.y + self.upper_height + self.lower_height + 6, self.x + 1, ' {} '.format( cut_string( self.used_host_memory.max_value_string(), maxlen=self.left_width - 2, padstr='..', ), ), ) self.addstr( self.y + 6, self.x + self.left_width + 2, ' {} '.format( cut_string( self.used_gpu_memory.max_value_string(), maxlen=self.right_width - 2, padstr='..', ), ), ) self.addstr(self.y + 7, self.x + self.left_width + 6, f' {self.used_gpu_memory} ') self.addstr( self.y + self.upper_height + 
self.lower_height + 5, self.x + self.left_width + 6, f' {self.gpu_sm_utilization} ', ) self.addstr( self.y + self.upper_height + self.lower_height + 6, self.x + self.left_width + 2, f' {self.gpu_sm_utilization.max_value_string()} ', ) for y in range(self.y + 6, self.y + 6 + self.upper_height): self.addstr(y, self.x, '│') self.addstr(y, self.x + self.left_width + 1, '│') for y in range( self.y + self.upper_height + 7, self.y + self.upper_height + self.lower_height + 7, ): self.addstr(y, self.x, '│') self.addstr(y, self.x + self.left_width + 1, '│') self.color(attr='dim') for y, p in itertools.chain( get_yticks(self.cpu_percent, self.y + 6), get_yticks(self.used_host_memory, self.y + self.upper_height + 7), ): self.addstr(y, self.x, f'├╴{p}% ') self.color_at(y, self.x, width=2, attr=0) x = self.x + self.left_width + 1 for y, p in itertools.chain( get_yticks(self.used_gpu_memory, self.y + 6), get_yticks(self.gpu_sm_utilization, self.y + self.upper_height + 7), ): self.addstr(y, x, f'├╴{p}% ') self.color_at(y, x, width=2, attr=0) def destroy(self): super().destroy() self._daemon_running.clear() def press(self, key): self.root.keymaps.use_keymap('process-metrics') self.root.press(key) nvitop-1.4.2/nvitop/tui/screens/treeview.py000066400000000000000000000500021474547113600210340ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. 
# pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring import threading import time from collections import deque from functools import partial from itertools import islice from nvitop.tui.library import ( NA, SUPERUSER, USERNAME, Displayable, HostProcess, Selection, Snapshot, WideString, host, send_signal, ttl_cache, ) class TreeNode: # pylint: disable=too-many-instance-attributes def __init__(self, process, children=()): self.process = process self.parent = None self.children = [] self.devices = set() self.children_set = set() self.is_root = True self.is_last = False self.prefix = '' for child in children: self.add(child) def add(self, child): if child in self.children_set: return self.children.append(child) self.children_set.add(child) child.parent = self child.is_root = False def __getattr__(self, name): try: return super().__getattr__(name) except AttributeError: return getattr(self.process, name) def __eq__(self, other): return self.process._ident == other.process._ident # pylint: disable=protected-access def __hash__(self): return hash(self.process) def as_snapshot(self): # pylint: disable=too-many-branches,too-many-statements if not isinstance(self.process, Snapshot): with self.process.oneshot(): try: username = self.process.username() except host.PsutilError: username = NA try: command = self.process.command() if len(command) == 0: command = 'Zombie Process' except host.AccessDenied: command = 'No Permissions' except host.PsutilError: command = 'No Such Process' try: cpu_percent = self.process.cpu_percent() except host.PsutilError: cpu_percent = cpu_percent_string = NA else: if cpu_percent is NA: cpu_percent_string = NA elif cpu_percent < 1000.0: cpu_percent_string = f'{cpu_percent:.1f}%' elif cpu_percent < 10000: cpu_percent_string = f'{int(cpu_percent)}%' else: cpu_percent_string = '9999+%' try: memory_percent = self.process.memory_percent() except host.PsutilError: memory_percent = memory_percent_string = NA else: 
if memory_percent is not NA: memory_percent_string = f'{memory_percent:.1f}%' else: memory_percent_string = NA try: num_threads = self.process.num_threads() except host.PsutilError: num_threads = NA try: running_time_human = self.process.running_time_human() except host.PsutilError: running_time_human = NA self.process = Snapshot( real=self.process, pid=self.process.pid, username=username, command=command, cpu_percent=cpu_percent, cpu_percent_string=cpu_percent_string, memory_percent=memory_percent, memory_percent_string=memory_percent_string, num_threads=num_threads, running_time_human=running_time_human, ) if len(self.children) > 0: for child in self.children: child.as_snapshot() self.children.sort( key=lambda node: ( node._gone, # pylint: disable=protected-access node.username, node.pid, ), ) for child in self.children: child.is_last = False self.children[-1].is_last = True def set_prefix(self, prefix=''): if self.is_root: self.prefix = '' else: self.prefix = prefix + ('└─ ' if self.is_last else '├─ ') prefix += ' ' if self.is_last else '│ ' for child in self.children: child.set_prefix(prefix) @classmethod def merge(cls, leaves): # pylint: disable=too-many-branches nodes = {} for process in leaves: if isinstance(process, Snapshot): process = process.real try: node = nodes[process.pid] except KeyError: node = nodes[process.pid] = cls(process) finally: try: node.devices.add(process.device) except AttributeError: pass queue = deque(nodes.values()) while len(queue) > 0: node = queue.popleft() try: with node.process.oneshot(): parent_process = node.process.parent() except host.PsutilError: continue if parent_process is None: continue try: parent = nodes[parent_process.pid] except KeyError: parent = nodes[parent_process.pid] = cls(parent_process) queue.append(parent) else: continue finally: parent.add(node) cpid_map = host.reverse_ppid_map() for process in leaves: if isinstance(process, Snapshot): process = process.real node = nodes[process.pid] for cpid in 
cpid_map.get(process.pid, []): if cpid not in nodes: nodes[cpid] = child = cls(HostProcess(cpid)) node.add(child) return sorted(filter(lambda node: node.is_root, nodes.values()), key=lambda node: node.pid) @staticmethod def freeze(roots): for root in roots: root.as_snapshot() root.set_prefix() return roots @staticmethod def flatten(roots): flattened = [] stack = list(reversed(roots)) while len(stack) > 0: top = stack.pop() flattened.append(top) stack.extend(reversed(top.children)) return flattened class TreeViewScreen(Displayable): # pylint: disable=too-many-instance-attributes NAME = 'treeview' SNAPSHOT_INTERVAL = 0.5 def __init__(self, win, root): super().__init__(win, root) self.selection = Selection(panel=self) self.x_offset = 0 self.y_mouse = None self._snapshot_buffer = [] self._snapshots = [] self.snapshot_lock = threading.Lock() self._snapshot_daemon = threading.Thread( name='treeview-snapshot-daemon', target=self._snapshot_target, daemon=True, ) self._daemon_running = threading.Event() self.x, self.y = root.x, root.y self.scroll_offset = 0 self.width, self.height = root.width, root.height @property def display_height(self): return self.height - 1 @property def visible(self): return self._visible @visible.setter def visible(self, value): if self._visible != value: self.need_redraw = True self._visible = value if self.visible: self._daemon_running.set() try: self._snapshot_daemon.start() except RuntimeError: pass self.snapshots = self.take_snapshots() else: self._daemon_running.clear() self.focused = False @property def snapshots(self): return self._snapshots @snapshots.setter def snapshots(self, snapshots): with self.snapshot_lock: self.need_redraw = self.need_redraw or len(self._snapshots) > len(snapshots) self._snapshots = snapshots if self.selection.is_set(): identity = self.selection.identity self.selection.reset() for i, process in enumerate(snapshots): if process._ident[:2] == identity[:2]: # pylint: disable=protected-access self.selection.index = i 
self.selection.process = process break @classmethod def set_snapshot_interval(cls, interval): assert interval > 0.0 interval = float(interval) cls.SNAPSHOT_INTERVAL = min(interval / 3.0, 1.0) cls.take_snapshots = ttl_cache(ttl=interval)( cls.take_snapshots.__wrapped__, # pylint: disable=no-member ) @ttl_cache(ttl=2.0) def take_snapshots(self): self.root.main_screen.process_panel.ensure_snapshots() snapshots = ( self.root.main_screen.process_panel._snapshot_buffer # pylint: disable=protected-access ) roots = TreeNode.merge(snapshots) roots = TreeNode.freeze(roots) nodes = TreeNode.flatten(roots) snapshots = [] for node in nodes: snapshot = node.process snapshot.username = WideString(snapshot.username) snapshot.prefix = node.prefix if len(node.devices) > 0: snapshot.devices = 'GPU ' + ','.join( dev.display_index for dev in sorted(node.devices, key=lambda device: device.tuple_index) ) else: snapshot.devices = 'Host' snapshots.append(snapshot) with self.snapshot_lock: self._snapshot_buffer = snapshots return snapshots def _snapshot_target(self): while True: self._daemon_running.wait() self.take_snapshots() time.sleep(self.SNAPSHOT_INTERVAL) def update_size(self, termsize=None): n_term_lines, n_term_cols = termsize = super().update_size(termsize=termsize) self.width = n_term_cols - self.x self.height = n_term_lines - self.y return termsize def poke(self): if self._daemon_running.is_set(): self.snapshots = self._snapshot_buffer self.selection.within_window = False if len(self.snapshots) > 0 and self.selection.is_set(): for i, process in enumerate(self.snapshots): y = self.y + 1 - self.scroll_offset + i if self.selection.is_same_on_host(process): self.selection.within_window = ( 1 <= y - self.y < self.height and self.width >= 79 ) if not self.selection.within_window: if y < self.y + 1: self.scroll_offset -= self.y + 1 - y elif y >= self.y + self.height: self.scroll_offset += y - self.y - self.height + 1 self.scroll_offset = max( min(len(self.snapshots) - 
self.display_height, self.scroll_offset), 0, ) break else: self.scroll_offset = 0 super().poke() def draw(self): # pylint: disable=too-many-statements,too-many-locals self.color_reset() pid_width = max(3, max((len(str(process.pid)) for process in self.snapshots), default=3)) username_width = max( 4, max((len(process.username) for process in self.snapshots), default=4), ) device_width = max(6, max((len(process.devices) for process in self.snapshots), default=6)) num_threads_width = max( 4, max((len(str(process.num_threads)) for process in self.snapshots), default=4), ) time_width = max( 4, max((len(process.running_time_human) for process in self.snapshots), default=4), ) header = ' '.join( [ 'PID'.rjust(pid_width), 'USER'.ljust(username_width), 'DEVICE'.rjust(device_width), 'NLWP'.rjust(num_threads_width), '%CPU', '%MEM', 'TIME'.rjust(time_width), 'COMMAND', ], ) command_offset = len(header) - 7 if self.x_offset < command_offset: self.addstr( self.y, self.x, header[self.x_offset : self.x_offset + self.width].ljust(self.width), ) else: self.addstr(self.y, self.x, 'COMMAND'.ljust(self.width)) self.color_at(self.y, self.x, width=self.width, fg='cyan', attr='bold | reverse') if len(self.snapshots) == 0: self.addstr( self.y + 1, self.x, 'No running GPU processes found{}.'.format(' (in WSL)' if host.WSL else ''), ) return hint = True if self.y_mouse is not None: self.selection.reset() hint = False self.selection.within_window = False processes = islice( self.snapshots, self.scroll_offset, self.scroll_offset + self.display_height, ) for y, process in enumerate(processes, start=self.y + 1): prefix_length = len(process.prefix) line = '{} {} {} {} {:>5} {:>5} {} {}{}'.format( str(process.pid).rjust(pid_width), process.username.ljust(username_width), process.devices.rjust(device_width), str(process.num_threads).rjust(num_threads_width), process.cpu_percent_string.replace('%', ''), process.memory_percent_string.replace('%', ''), process.running_time_human.rjust(time_width), 
process.prefix, process.command, ) line = str(WideString(line)[self.x_offset :].ljust(self.width)[: self.width]) self.addstr(y, self.x, line) prefix_length -= max(0, self.x_offset - command_offset) if prefix_length > 0: self.color_at( y, self.x + max(0, command_offset - self.x_offset), width=prefix_length, fg='green', attr='bold', ) if y == self.y_mouse: self.selection.process = process hint = True owned = str(process.username) == USERNAME or SUPERUSER if self.selection.is_same_on_host(process): self.color_at( y, self.x, width=self.width, fg='yellow' if self.selection.is_tagged(process) else 'green', attr='bold | reverse', ) self.selection.within_window = 1 <= y - self.y < self.height and self.width >= 79 elif self.selection.is_tagged(process): self.color_at( y, self.x, width=self.width, fg='yellow', attr='bold' if owned else 'bold | dim', ) elif not owned: self.color_at(y, self.x, width=self.width, attr='dim') if not hint: self.selection.clear() self.color(fg='cyan', attr='bold | reverse') text_offset = self.x + self.width - 47 if len(self.selection.tagged) > 0 or ( self.selection.owned() and self.selection.within_window ): self.addstr(self.y, text_offset - 1, ' (Press ^C(INT)/T(TERM)/K(KILL) to send signals)') self.color_at( self.y, text_offset + 7, width=2, fg='cyan', bg='yellow', attr='bold | italic | reverse', ) self.color_at( self.y, text_offset + 10, width=3, fg='cyan', bg='red', attr='bold | reverse', ) self.color_at( self.y, text_offset + 15, width=1, fg='cyan', bg='yellow', attr='bold | italic | reverse', ) self.color_at( self.y, text_offset + 17, width=4, fg='cyan', bg='red', attr='bold | reverse', ) self.color_at( self.y, text_offset + 23, width=1, fg='cyan', bg='yellow', attr='bold | italic | reverse', ) self.color_at( self.y, text_offset + 25, width=4, fg='cyan', bg='red', attr='bold | reverse', ) def finalize(self): self.y_mouse = None super().finalize() def destroy(self): super().destroy() self._daemon_running.clear() def press(self, key): 
self.root.keymaps.use_keymap('treeview') self.root.press(key) def click(self, event): if event.pressed(1) or event.pressed(3) or event.clicked(1) or event.clicked(3): self.y_mouse = event.y return True direction = event.wheel_direction() if event.shift(): self.x_offset = max(0, self.x_offset + 2 * direction) else: self.selection.move(direction=direction) return True def init_keybindings(self): def tree_left(): self.x_offset = max(0, self.x_offset - 5) def tree_right(): self.x_offset += 5 def tree_begin(): self.x_offset = 0 def select_move(direction): self.selection.move(direction=direction) def select_clear(): self.selection.clear() def tag(): self.selection.tag() select_move(direction=+1) keymaps = self.root.keymaps keymaps.bind('treeview', '', tree_left) keymaps.copy('treeview', '', '') keymaps.bind('treeview', '', tree_right) keymaps.copy('treeview', '', '') keymaps.bind('treeview', '', tree_begin) keymaps.copy('treeview', '', '^') keymaps.bind('treeview', '', partial(select_move, direction=-1)) keymaps.copy('treeview', '', '') keymaps.copy('treeview', '', '') keymaps.copy('treeview', '', '') keymaps.copy('treeview', '', '[') keymaps.bind('treeview', '', partial(select_move, direction=+1)) keymaps.copy('treeview', '', '') keymaps.copy('treeview', '', '') keymaps.copy('treeview', '', '') keymaps.copy('treeview', '', ']') keymaps.bind('treeview', '', partial(select_move, direction=-(1 << 20))) keymaps.bind('treeview', '', partial(select_move, direction=+(1 << 20))) keymaps.bind('treeview', '', select_clear) keymaps.bind('treeview', '', tag) keymaps.bind('treeview', 'T', partial(send_signal, signal='terminate', panel=self)) keymaps.bind('treeview', 'K', partial(send_signal, signal='kill', panel=self)) keymaps.copy('treeview', 'K', 'k') keymaps.bind('treeview', '', partial(send_signal, signal='interrupt', panel=self)) keymaps.copy('treeview', '', 'I') nvitop-1.4.2/nvitop/tui/tui.py000066400000000000000000000301571474547113600163520ustar00rootroot00000000000000# This 
file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. # pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring import curses import shutil import time from nvitop.tui.library import ALT_KEY, DisplayableContainer, KeyBuffer, KeyMaps, MouseEvent from nvitop.tui.screens import ( BreakLoop, EnvironScreen, HelpScreen, MainScreen, ProcessMetricsScreen, TreeViewScreen, ) class TUI(DisplayableContainer): # pylint: disable=too-many-instance-attributes # pylint: disable-next=too-many-arguments def __init__( self, devices, filters=(), *, ascii=False, # pylint: disable=redefined-builtin mode='auto', interval=None, win=None, ): super().__init__(win, root=self) self.x = self.y = 0 self.width = max(79, shutil.get_terminal_size(fallback=(79, 24)).columns - self.x) self.termsize = None self.ascii = ascii self.devices = devices self.device_count = len(self.devices) self.main_screen = MainScreen( self.devices, filters, ascii=ascii, mode=mode, win=win, root=self, ) self.main_screen.visible = True self.main_screen.focused = False self.add_child(self.main_screen) self.current_screen = self.previous_screen = self.main_screen self._messagebox = None if win is not None: self.environ_screen = EnvironScreen(win=win, root=self) self.environ_screen.visible = False self.environ_screen.ascii = False self.add_child(self.environ_screen) self.treeview_screen = TreeViewScreen(win=win, root=self) self.treeview_screen.visible = False self.treeview_screen.ascii = self.ascii self.add_child(self.treeview_screen) self.process_metrics_screen = ProcessMetricsScreen(win=win, root=self) self.process_metrics_screen.visible = False self.process_metrics_screen.ascii = self.ascii self.add_child(self.process_metrics_screen) self.help_screen = HelpScreen(win=win, root=self) self.help_screen.visible = False self.help_screen.ascii = False self.add_child(self.help_screen) if interval is not None: if interval < 1.0: 
self.main_screen.device_panel.set_snapshot_interval(interval) self.main_screen.host_panel.set_snapshot_interval(interval) if interval < 0.5: self.process_metrics_screen.set_snapshot_interval(interval) self.main_screen.process_panel.set_snapshot_interval(interval) self.treeview_screen.set_snapshot_interval(interval) self.keybuffer = KeyBuffer() self.keymaps = KeyMaps(self.keybuffer) self.last_input_time = time.monotonic() self.init_keybindings() @property def messagebox(self): return self._messagebox @messagebox.setter def messagebox(self, value): self.need_redraw = True if self._messagebox is not None: self.remove_child(self._messagebox) self._messagebox = value if value is not None: self._messagebox.visible = True self._messagebox.focused = True self._messagebox.ascii = self.ascii self._messagebox.previous_focused = self.get_focused_obj() self.add_child(self._messagebox) def get_focused_obj(self): if self.messagebox is not None: return self.messagebox return super().get_focused_obj() def update_size(self, termsize=None): n_term_lines, n_term_cols = termsize = super().update_size(termsize=termsize) self.width = n_term_cols - self.x self.height = n_term_lines - self.y for screen in self.container: if hasattr(screen, 'update_size'): screen.update_size(termsize) if self.termsize != termsize: self.termsize = termsize self.need_redraw = True return termsize def poke(self): super().poke() if self.termsize is None: self.update_size() def draw(self): if self.need_redraw: self.win.erase() self.set_base_attr(attr=0) self.color_reset() if self.width >= 79: if self.messagebox is not None: self.set_base_attr(attr='dim') super().draw() return if not self.need_redraw: return n_term_lines, n_term_cols = self.termsize message = ( f'nvitop needs at least a width of 79 to render, the current width is {self.width}.' 
) words = iter(message.split()) width = min(max(n_term_cols, 40), n_term_cols, 60) - 10 lines = [next(words)] for word in words: if len(lines[-1]) + len(word) + 1 <= width: lines[-1] += ' ' + word else: lines[-1] = lines[-1].strip() lines.append(word) height, width = len(lines) + 4, max(map(len, lines)) + 4 lines = [f'│ {line.ljust(width - 4)} │' for line in lines] lines = [ '╒' + '═' * (width - 2) + '╕', '│' + ' ' * (width - 2) + '│', *lines, '│' + ' ' * (width - 2) + '│', '╘' + '═' * (width - 2) + '╛', ] y_start, x_start = (n_term_lines - height) // 2, (n_term_cols - width) // 2 for y, line in enumerate(lines, start=y_start): self.addstr(y, x_start, line) def finalize(self): super().finalize() self.win.refresh() def redraw(self): self.poke() self.draw() self.finalize() def loop(self): if self.win is None: return try: while True: self.redraw() self.handle_input() if time.monotonic() - self.last_input_time > 1.0: time.sleep(0.2) except BreakLoop: pass def print(self): self.main_screen.print() def handle_mouse(self): """Handle mouse input.""" try: event = MouseEvent(curses.getmouse()) except curses.error: return super().click(event) def handle_key(self, key): """Handle key input.""" if key < 0: self.keybuffer.clear() elif not super().press(key): self.keymaps.use_keymap('main') self.press(key) def handle_keys(self, *keys): for key in keys: self.handle_key(key) def press(self, key): keybuffer = self.keybuffer keybuffer.add(key) if keybuffer.result is not None: try: keybuffer.result() finally: if keybuffer.finished_parsing: keybuffer.clear() elif keybuffer.finished_parsing: keybuffer.clear() return False return True def handle_input(self): # pylint: disable=too-many-branches key = self.win.getch() if key == curses.ERR: return self.last_input_time = time.monotonic() if key == curses.KEY_ENTER: key = ord('\n') if key == 27 or (128 <= key < 256): # Handle special keys like ALT+X or unicode here: keys = [key] for _ in range(4): getkey = self.win.getch() if getkey != -1: 
keys.append(getkey) if len(keys) == 1: keys.append(-1) elif keys[0] == 27: keys[0] = ALT_KEY self.handle_keys(*keys) curses.flushinp() elif key >= 0: # Handle simple key presses, CTRL+X, etc here: curses.flushinp() if key == curses.KEY_MOUSE: self.handle_mouse() elif key == curses.KEY_RESIZE: self.update_size() else: self.handle_key(key) def init_keybindings(self): # pylint: disable=multiple-statements,too-many-statements for screen in self.container: if hasattr(screen, 'init_keybindings'): screen.init_keybindings() def show_screen(screen, focused=None): for s in self.container: if s is screen: s.visible = True if focused is not None: s.focused = focused else: s.visible = False self.previous_screen = self.current_screen self.current_screen = screen def show_main(): show_screen(self.main_screen, focused=False) if self.treeview_screen.selection.is_set(): self.main_screen.selection.process = self.treeview_screen.selection.process self.treeview_screen.selection.clear() self.process_metrics_screen.disable() def show_environ(): show_screen(self.environ_screen, focused=True) if self.previous_screen is not self.help_screen: self.environ_screen.process = self.previous_screen.selection.process def environ_return(): if self.previous_screen is self.treeview_screen: show_treeview() elif self.previous_screen is self.process_metrics_screen: show_process_metrics() else: show_main() def show_treeview(): if not self.main_screen.process_panel.has_snapshots: return show_screen(self.treeview_screen, focused=True) if not self.treeview_screen.selection.is_set(): self.treeview_screen.selection.process = self.main_screen.selection.process self.main_screen.selection.clear() def show_process_metrics(): if self.current_screen is self.main_screen: if self.main_screen.selection.is_set(): show_screen(self.process_metrics_screen, focused=True) self.process_metrics_screen.process = self.previous_screen.selection.process elif self.current_screen is not self.treeview_screen: 
show_screen(self.process_metrics_screen, focused=True) def show_help(): show_screen(self.help_screen, focused=True) def help_return(): if self.previous_screen is self.treeview_screen: show_treeview() elif self.previous_screen is self.environ_screen: show_environ() elif self.previous_screen is self.process_metrics_screen: show_process_metrics() else: show_main() self.keymaps.bind('main', 'e', show_environ) self.keymaps.bind('environ', 'e', environ_return) self.keymaps.copy('environ', 'e', '') self.keymaps.copy('environ', 'e', 'q') self.keymaps.copy('environ', 'e', 'Q') self.keymaps.bind('main', 't', show_treeview) self.keymaps.bind('treeview', 't', show_main) self.keymaps.copy('treeview', 't', 'q') self.keymaps.copy('treeview', 't', 'Q') self.keymaps.bind('treeview', 'e', show_environ) self.keymaps.bind('main', '', show_process_metrics) self.keymaps.bind('process-metrics', '', show_main) self.keymaps.copy('process-metrics', '', '') self.keymaps.copy('process-metrics', '', 'q') self.keymaps.copy('process-metrics', '', 'Q') self.keymaps.bind('process-metrics', 'e', show_environ) for screen in ('main', 'treeview', 'environ', 'process-metrics'): self.keymaps.bind(screen, 'h', show_help) self.keymaps.copy(screen, 'h', '?') self.keymaps.bind('help', '', help_return) self.keymaps.bind('help', '', help_return) self.keymaps.use_keymap('main') nvitop-1.4.2/nvitop/version.py000066400000000000000000000100041474547113600164220ustar00rootroot00000000000000# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # # Copyright 2021-2025 Xuehai Pan. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for GPU process management.""" # pylint: disable=invalid-name __version__ = '1.4.2' __license__ = 'GPL-3.0-only AND Apache-2.0' __author__ = __maintainer__ = 'Xuehai Pan' __email__ = 'XuehaiPan@pku.edu.cn' __release__ = False if not __release__: import os import subprocess try: prefix, sep, suffix = ( subprocess.check_output( # noqa: S603 ['git', 'describe', '--abbrev=7'], # noqa: S607 cwd=os.path.dirname(os.path.abspath(__file__)), stderr=subprocess.DEVNULL, text=True, ) .strip() .lstrip('v') .replace('-', '.dev', 1) .replace('-', '+', 1) .partition('.dev') ) if sep: version_prefix, dot, version_tail = prefix.rpartition('.') prefix = f'{version_prefix}{dot}{int(version_tail) + 1}' __version__ = f'{prefix}{sep}{suffix}' del version_prefix, dot, version_tail else: __version__ = prefix del prefix, sep, suffix except (OSError, subprocess.CalledProcessError): pass del os, subprocess # The package `nvidia-ml-py` is not backward compatible over releases. This may # cause problems with Old versions of NVIDIA drivers. # The ideal solution is to let the user install the best-fit version of `nvidia-ml-py`. 
PYNVML_VERSION_CANDIDATES = ( # Sync with pyproject.toml and requirements.txt '11.450.51', # the last version supports the R430 driver (CUDA 10.x) '11.450.129', # requires at last the R450 driver '11.460.79', '11.470.66', '11.495.46', '11.510.69', # the first version supports the `nvmlMemory_v2` API '11.515.48', '11.515.75', '11.525.84', '11.525.112', '11.525.131', '11.525.150', '12.535.77', '12.535.108', '12.535.133', '12.535.161', '12.550.52', '12.550.89', '12.555.43', '12.560.30', '12.570.86', ) """The list of supported ``nvidia-ml-py`` versions. See also: `nvidia-ml-py's Release History `_. To install ``nvitop`` with a specific version of ``nvidia-ml-py``, use ``nvitop[pynvml-xx.yyy.zzz]``, for example: .. code:: bash pip3 install 'nvitop[pynvml-11.450.51]' or .. code:: bash pip3 install nvitop nvidia-ml-py==11.450.51 Note: The package ``nvidia-ml-py`` is not backward compatible over releases. This may cause problems such as *"Function Not Found"* errors with old versions of NVIDIA drivers (e.g. the NVIDIA R430 driver on Ubuntu 16.04 LTS). The ideal solution is to let the user install the best-fit version of ``nvidia-ml-py``. See also: `nvidia-ml-py's Release History `_. ``nvidia-ml-py==11.450.51`` is the last version supports the NVIDIA R430 driver (CUDA 10.x). Since ``nvidia-ml-py>=11.450.129``, the definition of struct ``nvmlProcessInfo_t`` has introduced two new fields ``gpuInstanceId`` and ``computeInstanceId`` (GI ID and CI ID in newer ``nvidia-smi``) which are incompatible with some old NVIDIA drivers. ``nvitop`` may not display the processes correctly due to this incompatibility. """ nvitop-1.4.2/pyproject.toml000066400000000000000000000146021474547113600157700ustar00rootroot00000000000000[build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" [project] name = "nvitop" description = "An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for GPU process management." 
readme = "README.md" requires-python = ">= 3.7" authors = [{ name = "Xuehai Pan", email = "XuehaiPan@pku.edu.cn" }] license = { text = "Apache License, Version 2.0 (Apache-2.0) & GNU General Public License, Version 3 (GPL-3.0)" } keywords = [ "nvidia", "nvidia-smi", "NVIDIA", "NVML", "CUDA", "GPU", "top", "monitoring", ] classifiers = [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: Apache Software License", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX :: Linux", "Environment :: GPU", "Environment :: GPU :: NVIDIA CUDA", "Environment :: Console", "Environment :: Console :: Curses", "Intended Audience :: Developers", "Intended Audience :: End Users/Desktop", "Intended Audience :: System Administrators", "Topic :: System :: Hardware", "Topic :: System :: Monitoring", "Topic :: System :: Systems Administration", "Topic :: Utilities", ] dependencies = [ # Sync with nvitop/version.py and requirements.txt "nvidia-ml-py >= 11.450.51, < 12.571.0a0", "psutil >= 5.6.6", "colorama >= 0.4.0; platform_system == 'Windows'", "windows-curses >= 2.2.0; platform_system == 'Windows'", ] dynamic = ["version", "optional-dependencies"] [project.scripts] nvitop = "nvitop.cli:main" nvisel = "nvitop.select:main" [project.urls] Homepage = "https://github.com/XuehaiPan/nvitop" Repository = "https://github.com/XuehaiPan/nvitop" Documentation = "https://nvitop.readthedocs.io" "Bug Report" = "https://github.com/XuehaiPan/nvitop/issues" [tool.setuptools.packages.find] include = ["nvitop", "nvitop.*"] [tool.black] line-length = 100 
skip-string-normalization = true target-version = ["py37"] [tool.mypy] # Sync with requires-python python_version = "3.8" # appease mypy for syntax errors in numpy stubs mypy_path = [".", "nvitop-exporter"] exclude = ["nvitop-exporter/setup.py"] pretty = true show_column_numbers = true show_error_codes = true show_error_context = true show_traceback = true allow_redefinition = true check_untyped_defs = true disallow_incomplete_defs = true disallow_untyped_defs = true ignore_missing_imports = true no_implicit_optional = true strict_equality = true strict_optional = true warn_no_return = true warn_redundant_casts = true warn_unreachable = true warn_unused_configs = true warn_unused_ignores = true [[tool.mypy.overrides]] module = ['nvitop.callbacks.*', 'nvitop.tui.*'] ignore_errors = true [tool.pylint] main.py-version = "3.7" basic.good-names = ["x", "y", "dx", "dy", "p", "s", "fg", "bg", "n", "ui", "tx", "rx"] format.max-line-length = 120 "messages control".disable = ["consider-using-f-string", "duplicate-code", "wrong-import-order"] spelling.spelling-dict = "en_US" spelling.spelling-private-dict-file = "docs/source/spelling_wordlist.txt" [tool.codespell] ignore-words = "docs/source/spelling_wordlist.txt" [tool.ruff] # Sync with requires-python target-version = "py37" line-length = 100 output-format = "full" src = ["nvitop", "nvitop-exporter/nvitop_exporter"] [tool.ruff.lint] select = [ "E", "W", # pycodestyle "F", # pyflakes "I", # isort "N", # pep8-naming "UP", # pyupgrade "D", # pydocstyle "ANN", # flake8-annotations "S", # flake8-bandit "BLE", # flake8-blind-except "B", # flake8-bugbear "COM", # flake8-commas "C4", # flake8-comprehensions "EXE", # flake8-executable "FA", # flake8-future-annotations "LOG", # flake8-logging "ISC", # flake8-implicit-str-concat "INP", # flake8-no-pep420 "PIE", # flake8-pie "PYI", # flake8-pyi "Q", # flake8-quotes "RSE", # flake8-raise "RET", # flake8-return "SIM", # flake8-simplify "TID", # flake8-tidy-imports "TCH", # 
flake8-type-checking "PERF", # perflint "FURB", # refurb "TRY", # tryceratops "RUF", # ruff ] ignore = [ # E501: line too long # W505: doc line too long # too long docstring due to long example blocks "E501", "W505", # ANN401: dynamically typed expressions (typing.Any) are disallowed "ANN401", # FURB189: use the `UserDict`, `UserList`, and `UserString` instead # internally subclassing `dict`, `list`, and `str` "FURB189", # S101: use of `assert` detected # internal use and may never raise at runtime "S101", # SIM105: use `contextlib.suppress(...)` instead of try-except-pass # reduce unnecessary function call "SIM105", # TRY003: avoid specifying long messages outside the exception class # long messages are necessary for clarity "TRY003", # RUF022: `__all__` is not ordered according to an "isort-style" sort # `__all__` contains comments to group names "RUF022", ] [tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", # unused-import ] "setup.py" = [ "D", # pydocstyle "ANN", # flake8-annotations ] "nvitop/api/lib*.py" = [ "N", # pep8-naming ] "nvitop/callbacks/*.py" = [ "D", # pydocstyle "ANN", # flake8-annotations ] "nvitop/tui/**/*.py" = [ "D", # pydocstyle "ANN", # flake8-annotations "RUF012", # mutable-class-default ] "docs/source/conf.py" = [ "D", # pydocstyle "INP001", # flake8-no-pep420 ] [tool.ruff.lint.isort] known-first-party = ["nvitop", "nvitop_exporter"] extra-standard-library = ["typing_extensions"] lines-after-imports = 2 [tool.ruff.lint.pydocstyle] convention = "google" [tool.ruff.lint.flake8-annotations] allow-star-arg-any = true [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" multiline-quotes = "double" inline-quotes = "single" [tool.ruff.lint.flake8-tidy-imports] ban-relative-imports = "all" nvitop-1.4.2/requirements.txt000066400000000000000000000003171474547113600163360ustar00rootroot00000000000000# Sync with pyproject.toml and nvitop/version.py nvidia-ml-py >= 11.450.51, < 12.571.0a0 psutil >= 5.6.6 colorama >= 0.4.0; 
platform_system == 'Windows' windows-curses >= 2.2.0; platform_system == 'Windows' nvitop-1.4.2/setup.py000077500000000000000000000047451474547113600146000ustar00rootroot00000000000000#!/usr/bin/env python3 # To install `nvitop` with specific version of `nvidia-ml-py`, use: # # pip install nvidia-ml-py==xx.yyy.zz nvitop # # or # # pip install 'nvitop[pynvml-xx.yyy.zz]' # """Setup script for ``nvitop``.""" from __future__ import annotations import contextlib import re import sys from importlib.util import module_from_spec, spec_from_file_location from pathlib import Path from typing import TYPE_CHECKING, Generator from setuptools import setup if TYPE_CHECKING: from types import ModuleType HERE = Path(__file__).absolute().parent @contextlib.contextmanager def vcs_version(name: str, path: Path | str) -> Generator[ModuleType]: """Context manager to update version string in a version module.""" path = Path(path).absolute() assert path.is_file() module_spec = spec_from_file_location(name=name, location=path) assert module_spec is not None assert module_spec.loader is not None module = sys.modules.get(name) if module is None: module = module_from_spec(module_spec) sys.modules[name] = module module_spec.loader.exec_module(module) if module.__release__: yield module return content = None try: try: content = path.read_text(encoding='utf-8') path.write_text( data=re.sub( r"""__version__\s*=\s*('[^']+'|"[^"]+")""", f'__version__ = {module.__version__!r}', string=content, ), encoding='utf-8', ) except OSError: content = None yield module finally: if content is not None: with path.open(mode='wt', encoding='utf-8', newline='') as file: file.write(content) with vcs_version( name='nvitop.version', path=HERE / 'nvitop' / 'version.py', ) as version: setup( name='nvitop', version=version.__version__, extras_require={ 'lint': [ 'black >= 24.0.0, < 25.0.0a0', 'pylint[spelling]', 'mypy', 'typing-extensions', 'pre-commit', ], 'cuda10': ['nvidia-ml-py == 11.450.51'], **{ # The identifier 
could not start with numbers, add a prefix `pynvml-` f'pynvml-{pynvml}': [f'nvidia-ml-py == {pynvml}'] for pynvml in version.PYNVML_VERSION_CANDIDATES }, }, )