pax_global_header00006660000000000000000000000064141353014200014503gustar00rootroot0000000000000052 comment=8eeeffb362c31af4427b21a84e2ef6cbdddfd0c3 prometheus-node-exporter-collectors-0+git20211024.8eeeffb/000077500000000000000000000000001413530142000232355ustar00rootroot00000000000000prometheus-node-exporter-collectors-0+git20211024.8eeeffb/.circleci/000077500000000000000000000000001413530142000250705ustar00rootroot00000000000000prometheus-node-exporter-collectors-0+git20211024.8eeeffb/.circleci/config.yml000066400000000000000000000006171413530142000270640ustar00rootroot00000000000000--- version: 2.1 orbs: shellcheck: circleci/shellcheck@1.3.15 executors: shellcheck: docker: - image: koalaman/shellcheck python: docker: - image: cimg/python:3.9 jobs: flake8_lint: executor: python steps: - checkout - run: pip install flake8 - run: flake8 workflows: version: 2 scripts: jobs: - shellcheck/check - flake8_lint prometheus-node-exporter-collectors-0+git20211024.8eeeffb/.flake8000066400000000000000000000001321413530142000244040ustar00rootroot00000000000000[flake8] import-order-style = google max-line-length = 100 exclude= .git, .circleci, prometheus-node-exporter-collectors-0+git20211024.8eeeffb/CODE_OF_CONDUCT.md000066400000000000000000000002331413530142000260320ustar00rootroot00000000000000## Prometheus Community Code of Conduct Prometheus follows the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/master/code-of-conduct.md). prometheus-node-exporter-collectors-0+git20211024.8eeeffb/LICENSE000066400000000000000000000261351413530142000242510ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. prometheus-node-exporter-collectors-0+git20211024.8eeeffb/MAINTAINERS.md000066400000000000000000000001441413530142000253300ustar00rootroot00000000000000* Ben Kochie @SuperQ * Daniel Swarbrick @dswarbrick prometheus-node-exporter-collectors-0+git20211024.8eeeffb/README.md000066400000000000000000000011631413530142000245150ustar00rootroot00000000000000# Text collector example scripts These scripts are examples to be used with the Node Exporter Textfile Collector. To use these scripts, we recommend using a `sponge` to atomically write the output. 
    <collector_script> | sponge <output_file>

Sponge comes from [moreutils](https://joeyh.name/code/moreutils/)

* [brew install moreutils](http://brewformulas.org/Moreutil)
* [apt install moreutils](https://packages.debian.org/search?keywords=moreutils)
* [pkg install moreutils](https://www.freshports.org/sysutils/moreutils/)

For more information see:
https://github.com/prometheus/node_exporter#textfile-collector

prometheus-node-exporter-collectors-0+git20211024.8eeeffb/SECURITY.md

# Reporting a security issue

The Prometheus security policy, including how to report vulnerabilities, can be found here:
https://prometheus.io/docs/operating/security/

prometheus-node-exporter-collectors-0+git20211024.8eeeffb/apt.sh

#!/bin/bash
#
# Description: Expose metrics from apt updates.
#
# Author: Ben Kochie

upgrades="$(/usr/bin/apt-get --just-print dist-upgrade \
  | /usr/bin/awk -F'[()]' \
      '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2); sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \
  | /usr/bin/sort \
  | /usr/bin/uniq -c \
  | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/"/, "\\\"", $2); gsub(/\[/, "", $3); gsub(/\]/, "", $3); print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $NF "\"} " $1}'
)"

autoremove="$(/usr/bin/apt-get --just-print autoremove \
  | /usr/bin/awk '/^Remv/{a++}END{printf "apt_autoremove_pending %d", a}'
)"

echo '# HELP apt_upgrades_pending Apt package pending updates by origin.'
echo '# TYPE apt_upgrades_pending gauge'
if [[ -n "${upgrades}" ]] ; then
  echo "${upgrades}"
else
  echo 'apt_upgrades_pending{origin="",arch=""} 0'
fi

echo '# HELP apt_autoremove_pending Apt package pending autoremove.'
echo '# TYPE apt_autoremove_pending gauge'
echo "${autoremove}"

echo '# HELP node_reboot_required Node reboot is required for software updates.'
echo '# TYPE node_reboot_required gauge'
if [[ -f '/run/reboot-required' ]] ; then
  echo 'node_reboot_required 1'
else
  echo 'node_reboot_required 0'
fi

prometheus-node-exporter-collectors-0+git20211024.8eeeffb/btrfs_stats.py

#!/usr/bin/env python3

# Collect per-device btrfs filesystem errors.
# Designed to work on Debian and CentOS 6 (with python2.6).

import glob
import os
import re
import subprocess


def get_btrfs_mount_points():
    """List all btrfs mount points.

    Yields:
        (string) filesystem mount points.
    """
    with open("/proc/mounts") as f:
        for line in f:
            parts = line.split()
            if parts[2] == "btrfs":
                yield parts[1]


def get_btrfs_errors(mountpoint):
    """Get per-device errors for a btrfs mount point.

    Args:
        mountpoint: (string) path to a mount point.

    Yields:
        (device, error_type, error_count) tuples, where:
            device: (string) path to block device.
            error_type: (string) type of btrfs error.
            error_count: (int) number of btrfs errors of a given type.
    """
    p = subprocess.Popen(["btrfs", "device", "stats", mountpoint],
                         stdout=subprocess.PIPE)
    (stdout, stderr) = p.communicate()
    if p.returncode != 0:
        raise RuntimeError("btrfs returned exit code %d" % p.returncode)
    for line in stdout.splitlines():
        if not line:
            continue
        # Sample line:
        # [/dev/vdb1].flush_io_errs   0
        m = re.search(r"^\[([^\]]+)\]\.(\S+)\s+(\d+)$", line.decode("utf-8"))
        if not m:
            raise RuntimeError("unexpected output from btrfs: '%s'" % line)
        yield m.group(1), m.group(2), int(m.group(3))


def btrfs_error_metrics():
    """Collect btrfs error metrics.

    Returns:
        a list of strings to be exposed as Prometheus metrics.
""" metric = "node_btrfs_errors_total" contents = [ "# TYPE %s counter" % metric, "# HELP %s number of btrfs errors" % metric, ] for mountpoint in get_btrfs_mount_points(): for device, error_type, error_count in get_btrfs_errors(mountpoint): contents.append( '%s{mountpoint="%s",device="%s",type="%s"} %d' % (metric, mountpoint, device, error_type, error_count)) if len(contents) > 2: # return metrics if there are actual btrfs filesystems found # (i.e. `contents` contains more than just TYPE and HELP). return contents else: return [] def btrfs_allocation_metrics(): """Collect btrfs allocation metrics. Returns: a list of strings to be exposed as Prometheus metrics. """ prefix = 'node_btrfs_allocation' metric_to_filename = { 'size_bytes': 'total_bytes', 'used_bytes': 'bytes_used', 'reserved_bytes': 'bytes_reserved', 'pinned_bytes': 'bytes_pinned', 'disk_size_bytes': 'disk_total', 'disk_used_bytes': 'disk_used', } contents = [] for m, f in metric_to_filename.items(): contents += [ "# TYPE %s_%s gauge" % (prefix, m), "# HELP %s_%s btrfs allocation data (%s)" % (prefix, m, f), ] for alloc in glob.glob("/sys/fs/btrfs/*/allocation"): fs = alloc.split('/')[4] for type_ in ('data', 'metadata', 'system'): for m, f in metric_to_filename.items(): filename = os.path.join(alloc, type_, f) with open(filename) as f: value = int(f.read().strip()) contents.append('%s_%s{fs="%s",type="%s"} %d' % ( prefix, m, fs, type_, value)) if len(contents) > 2 * len(metric_to_filename): return contents else: return [] if __name__ == "__main__": contents = btrfs_error_metrics() + btrfs_allocation_metrics() print("\n".join(contents)) prometheus-node-exporter-collectors-0+git20211024.8eeeffb/chrony.py000077500000000000000000000034361413530142000251220ustar00rootroot00000000000000#!/usr/bin/env python3 # # Description: Gather metrics from Chrony NTP. # import subprocess import sys from prometheus_client import CollectorRegistry, Gauge, generate_latest def chronyc(*args, check=True): """Chrony client wrapper Returns: (str) Data piped to stdout by the chrony subprocess. 
""" return subprocess.run( ['chronyc', *args], stdout=subprocess.PIPE, check=check ).stdout.decode('utf-8') def chronyc_tracking(): return chronyc('-c', 'tracking').split(',') def main(): registry = CollectorRegistry() chrony_tracking = chronyc_tracking() if len(chrony_tracking) != 14: print("ERROR: Unable to parse chronyc tracking CSV", file=sys.stderr) sys.exit(1) g = Gauge('chrony_tracking_reference_info', 'The stratum of the current preferred source', ['ref_id', 'ref_host'], registry=registry) g.labels(chrony_tracking[0], chrony_tracking[1]).set(1) g = Gauge('chrony_tracking_stratum', 'The stratum of the current preferred source', registry=registry) g.set(chrony_tracking[2]) g = Gauge('chrony_tracking_system_offset_seconds', 'The current estimated drift of system time from true time', registry=registry) g.set(chrony_tracking[4]) g = Gauge('chrony_tracking_last_offset_seconds', 'The estimated local offset on the last clock update.', registry=registry) g.set(chrony_tracking[5]) g = Gauge('chrony_tracking_root_dispersion_seconds', 'The absolute bound on the computer’s clock accuracy', registry=registry) g.set(chrony_tracking[5]) print(generate_latest(registry).decode("utf-8"), end='') if __name__ == "__main__": main() prometheus-node-exporter-collectors-0+git20211024.8eeeffb/deleted_libraries.py000077500000000000000000000051551413530142000272620ustar00rootroot00000000000000#!/usr/bin/env python3 """ Script to count the number of deleted libraries that are linked by running processes and expose a summary as Prometheus metrics. The aim is to discover processes that are still using libraries that have since been updated, perhaps due security vulnerabilities. """ import errno import glob import os import sys def main(): processes_linking_deleted_libraries = {} for path in glob.glob('/proc/*/maps'): try: with open(path, 'rb') as file: for line in file: part = line.decode().strip().split() if len(part) == 7: library = part[5] comment = part[6] if '/lib/' in library and '(deleted)' in comment: if path not in processes_linking_deleted_libraries: processes_linking_deleted_libraries[path] = {} if library in processes_linking_deleted_libraries[path]: processes_linking_deleted_libraries[path][library] += 1 else: processes_linking_deleted_libraries[path][library] = 1 except EnvironmentError as e: # Ignore non-existent files, since the files may have changed since # we globbed. 
if e.errno != errno.ENOENT: sys.exit('Failed to open file: {0}'.format(path)) num_processes_per_library = {} for process, library_count in processes_linking_deleted_libraries.items(): libraries_seen = set() for library, count in library_count.items(): if library in libraries_seen: continue libraries_seen.add(library) if library in num_processes_per_library: num_processes_per_library[library] += 1 else: num_processes_per_library[library] = 1 metric_name = 'node_processes_linking_deleted_libraries' description = 'Count of running processes that link a deleted library' print('# HELP {0} {1}'.format(metric_name, description)) print('# TYPE {0} gauge'.format(metric_name)) for library, count in num_processes_per_library.items(): dir_path, basename = os.path.split(library) basename = basename.replace('"', '\\"') dir_path = dir_path.replace('"', '\\"') print('{0}{{library_path="{1}", library_name="{2}"}} {3}'.format( metric_name, dir_path, basename, count) ) if __name__ == "__main__": main() prometheus-node-exporter-collectors-0+git20211024.8eeeffb/directory-size.sh000077500000000000000000000012101413530142000265420ustar00rootroot00000000000000#!/bin/sh # # Expose directory usage metrics, passed as an argument. # # Usage: add this to crontab: # # */5 * * * * prometheus directory-size.sh /var/lib/prometheus | sponge /var/lib/node_exporter/directory_size.prom # # sed pattern taken from https://www.robustperception.io/monitoring-directory-sizes-with-the-textfile-collector/ # # Author: Antoine Beaupré echo "# HELP node_directory_size_bytes Disk space used by some directories" echo "# TYPE node_directory_size_bytes gauge" du --block-size=1 --summarize "$@" \ | sed -ne 's/\\/\\\\/;s/"/\\"/g;s/^\([0-9]\+\)\t\(.*\)$/node_directory_size_bytes{directory="\2"} \1/p' prometheus-node-exporter-collectors-0+git20211024.8eeeffb/inotify-instances000077500000000000000000000066561413530142000266460ustar00rootroot00000000000000#!/usr/bin/env python3 """ Expose Linux inotify(7) instance resource consumption. Operational properties: - This script may be invoked as an unprivileged user; in this case, metrics will only be exposed for processes owned by that unprivileged user. - No metrics will be exposed for processes that do not hold any inotify fds. Requires Python 3.5 or later. """ import collections import os import sys class Error(Exception): pass class _PIDGoneError(Error): pass _Process = collections.namedtuple( "Process", ["pid", "uid", "command", "inotify_instances"]) def _read_bytes(name): with open(name, mode='rb') as f: return f.read() def _pids(): for n in os.listdir("/proc"): if not n.isdigit(): continue yield int(n) def _pid_uid(pid): try: s = os.stat("/proc/{}".format(pid)) except FileNotFoundError: raise _PIDGoneError() return s.st_uid def _pid_command(pid): # Avoid GNU ps(1) for it truncates comm. 
# https://bugs.launchpad.net/ubuntu/+source/procps/+bug/295876/comments/3 try: cmdline = _read_bytes("/proc/{}/cmdline".format(pid)) except FileNotFoundError: raise _PIDGoneError() if not len(cmdline): return "" try: prog = cmdline[0:cmdline.index(0x00)] except ValueError: prog = cmdline return os.path.basename(prog).decode(encoding="ascii", errors="surrogateescape") def _pid_inotify_instances(pid): instances = 0 try: for fd in os.listdir("/proc/{}/fd".format(pid)): try: target = os.readlink("/proc/{}/fd/{}".format(pid, fd)) except FileNotFoundError: continue if target == "anon_inode:inotify": instances += 1 except FileNotFoundError: raise _PIDGoneError() return instances def _get_processes(): for p in _pids(): try: yield _Process(p, _pid_uid(p), _pid_command(p), _pid_inotify_instances(p)) except (PermissionError, _PIDGoneError): continue def _get_processes_nontrivial(): return (p for p in _get_processes() if p.inotify_instances > 0) def _format_gauge_metric(metric_name, metric_help, samples, value_func, tags_func=None, stream=sys.stdout): def _println(*args, **kwargs): if "file" not in kwargs: kwargs["file"] = stream print(*args, **kwargs) def _print(*args, **kwargs): if "end" not in kwargs: kwargs["end"] = "" _println(*args, **kwargs) _println("# HELP {} {}".format(metric_name, metric_help)) _println("# TYPE {} gauge".format(metric_name)) for s in samples: value = value_func(s) tags = None if tags_func: tags = tags_func(s) _print(metric_name) if tags: _print("{") _print(",".join(["{}=\"{}\"".format(k, v) for k, v in tags])) _print("}") _print(" ") _println(value) def main(args_unused=None): _format_gauge_metric( "inotify_instances", "Total number of inotify instances held open by a process.", _get_processes_nontrivial(), lambda s: s.inotify_instances, lambda s: [("pid", s.pid), ("uid", s.uid), ("command", s.command)]) if __name__ == "__main__": sys.exit(main(sys.argv)) prometheus-node-exporter-collectors-0+git20211024.8eeeffb/ipmitool000077500000000000000000000033461413530142000250250ustar00rootroot00000000000000#!/usr/bin/awk -f # # Converts output of `ipmitool sensor` to prometheus format. # # With GNU awk: # ipmitool sensor | ./ipmitool > ipmitool.prom # # With BSD awk: # ipmitool sensor | awk -f ./ipmitool > ipmitool.prom # function export(values, name) { if (values["metric_count"] < 1) { return } delete values["metric_count"] printf("# HELP %s%s %s sensor reading from ipmitool\n", namespace, name, help[name]); printf("# TYPE %s%s gauge\n", namespace, name); for (sensor in values) { printf("%s%s{sensor=\"%s\"} %f\n", namespace, name, sensor, values[sensor]); } } # Fields are Bar separated, with space padding. BEGIN { FS = "[ ]*[|][ ]*"; namespace = "node_ipmi_"; # Friendly description of the type of sensor for HELP. help["temperature_celsius"] = "Temperature"; help["volts"] = "Voltage"; help["power_watts"] = "Power"; help["speed_rpm"] = "Fan"; help["status"] = "Chassis status"; temperature_celsius["metric_count"] = 0; volts["metric_count"] = 0; power_watts["metric_count"] = 0; speed_rpm["metric_count"] = 0; status["metric_count"] = 0; } # Not a valid line. { if (NF < 3) { next } } # $2 is value field. $2 ~ /na/ { next } # $3 is type field. 
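# For reference, a typical `ipmitool sensor` line looks like the following
# (name, reading, unit/type, status, then thresholds; the values shown here
# are purely illustrative):
#   CPU Temp         | 45.000     | degrees C  | ok    | ...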
$3 ~ /degrees C/ { temperature_celsius[$1] = $2; temperature_celsius["metric_count"]++; } $3 ~ /Volts/ { volts[$1] = $2; volts["metric_count"]++; } $3 ~ /Watts/ { power_watts[$1] = $2; power_watts["metric_count"]++; } $3 ~ /RPM/ { speed_rpm[$1] = $2; speed_rpm["metric_count"]++; } $3 ~ /discrete/ { status[$1] = sprintf("%d", substr($2,3,2)); status["metric_count"]++; } END { export(temperature_celsius, "temperature_celsius"); export(volts, "volts"); export(power_watts, "power_watts"); export(speed_rpm, "speed_rpm"); export(status, "status"); } prometheus-node-exporter-collectors-0+git20211024.8eeeffb/lvm-prom-collector000077500000000000000000000104111413530142000267150ustar00rootroot00000000000000#!/bin/bash # Expose various types of information about lvm2 # # Usage: lvm-prom-collector # # Options: # # -g for used and free space of logical volume groups # -p for used and free space of physical volumes. # -s for the percentage usage of the snapshots # -t for the percentage usage of the thin pools # # * * * * * root lvm-prom-collector -g | sponge /var/lib/prometheus/node-exporter/lvm.prom # # This will expose every minute information about the logical volume groups # # Author: Badreddin Aboubakr set -eu display_usage() { echo "This script must be run with super-user privileges." echo "Usage: lvm-prom-collector options" echo "Options:" echo "Expose various types of information about lvm2" echo "Use -g for used and free space of logical volume groups." echo "use -p for used and free space of physical volumes." echo "Use -s for the percentage usage of the snapshots." echo "Use -t for the percentage usage of the thin pools." } if [ "$(id -u)" != "0" ]; then 1>&2 echo "This script must be run with super-user privileges." exit 1 fi if [ $# -eq 0 ] then display_usage exit 1 fi thin_pools=false snapshots=false physical=false groups=false while getopts "htpsg" opt; do case $opt in p) physical=true ;; s) snapshots=true ;; g) groups=true ;; t) thin_pools=true ;; h) display_usage exit 0 ;; \?) display_usage exit 1 ;; esac done if [ "$physical" = true ] ; then echo "# HELP node_physical_volume_size Physical volume size in bytes" echo "# TYPE node_physical_volume_size gauge" echo "# HELP node_physical_volume_free Physical volume free space in bytes" echo "# TYPE node_physical_volume_free gauge" pvs_output=$(pvs --noheadings --units b --nosuffix --nameprefixes --unquoted --options pv_name,pv_fmt,pv_free,pv_size,pv_uuid 2>/dev/null) echo "$pvs_output" | while IFS= read -r line ; do # Skip if the line is empty [ -z "$line" ] && continue declare $line echo "node_physical_volume_size{name=\"$LVM2_PV_NAME\", uuid=\"$LVM2_PV_UUID\", format=\"$LVM2_PV_FMT\"} $LVM2_PV_SIZE" echo "node_physical_volume_free{name=\"$LVM2_PV_NAME\", uuid=\"$LVM2_PV_UUID\", format=\"$LVM2_PV_FMT\"} $LVM2_PV_FREE" done fi if [ "$snapshots" = true ] ; then echo "# HELP node_lvm_snapshots_allocated percentage of allocated data to a snapshot" echo "# TYPE node_lvm_snapshots_allocated gauge" lvs_output=$(lvs --noheadings --select 'lv_attr=~[^s.*]' --units b --nosuffix --unquoted --nameprefixes --options lv_uuid,vg_name,data_percent 2>/dev/null) echo "$lvs_output" | while IFS= read -r line ; do # Skip if the line is empty [ -z "$line" ] && continue declare $line # Convert ',' to '.' 
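    # (in some locales lvs prints a decimal comma, e.g. "12,50", while
    # Prometheus requires "12.50")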
data_percent=$(echo "$LVM2_DATA_PERCENT" | sed 's/\,/./' ) echo "node_lvm_snapshots_allocated{uuid=\"$LVM2_LV_UUID\", vgroup=\"$LVM2_VG_NAME\"} $data_percent" done fi if [ "$thin_pools" = true ] ; then echo "# HELP node_lvm_thin_pools_allocated percentage of allocated thin pool data" echo "# TYPE node_lvm_thin_pools_allocated gauge" lvs_output=$(lvs --noheadings --select 'lv_attr=~[^t.*]' --units b --nosuffix --unquoted --nameprefixes --options lv_uuid,vg_name,data_percent 2>/dev/null) echo "$lvs_output" | while IFS= read -r line ; do # Skip if the line is empty [ -z "$line" ] && continue declare $line # Convert ',' to '.' data_percent=$(echo "$LVM2_DATA_PERCENT" | sed 's/\,/./' ) echo "node_lvm_thin_pools_allocated{uuid=\"$LVM2_LV_UUID\", vgroup=\"$LVM2_VG_NAME\"} $data_percent" done fi if [ "$groups" = true ] ; then echo "# HELP node_volume_group_size Volume group size in bytes" echo "# TYPE node_volume_group_size gauge" echo "# HELP node_volume_group_free volume group free space in bytes" echo "# TYPE node_volume_group_free gauge" vgs_output=$(vgs --noheadings --units b --nosuffix --unquoted --nameprefixes --options vg_name,vg_free,vg_size 2>/dev/null) echo "$vgs_output" | while IFS= read -r line ; do # Skip if the line is empty [ -z "$line" ] && continue declare $line echo "node_volume_group_size{name=\"$LVM2_VG_NAME\"} $LVM2_VG_SIZE" echo "node_volume_group_free{name=\"$LVM2_VG_NAME\"} $LVM2_VG_FREE" done fi prometheus-node-exporter-collectors-0+git20211024.8eeeffb/md_info.sh000077500000000000000000000041071413530142000252110ustar00rootroot00000000000000#!/usr/bin/env bash set -eu for MD_DEVICE in /dev/md/*; do # Subshell to avoid eval'd variables from leaking between iterations ( # Resolve symlink to discover device, e.g. /dev/md127 MD_DEVICE_NUM=$(readlink -f "${MD_DEVICE}") # Remove /dev/ prefix MD_DEVICE_NUM=${MD_DEVICE_NUM#/dev/} MD_DEVICE=${MD_DEVICE#/dev/md/} # Query sysfs for info about md device SYSFS_BASE="/sys/devices/virtual/block/${MD_DEVICE_NUM}/md" MD_LAYOUT=$(cat "${SYSFS_BASE}/layout") MD_LEVEL=$(cat "${SYSFS_BASE}/level") MD_METADATA_VERSION=$(cat "${SYSFS_BASE}/metadata_version") MD_NUM_RAID_DISKS=$(cat "${SYSFS_BASE}/raid_disks") # Remove 'raid' prefix from RAID level MD_LEVEL=${MD_LEVEL#raid} # Output disk metrics for RAID_DISK in "${SYSFS_BASE}"/rd[0-9]*; do DISK=$(readlink -f "${RAID_DISK}/block") DISK_DEVICE=$(basename "${DISK}") RAID_DISK_DEVICE=$(basename "${RAID_DISK}") RAID_DISK_INDEX=${RAID_DISK_DEVICE#rd} RAID_DISK_STATE=$(cat "${RAID_DISK}/state") DISK_SET="" # Determine disk set using logic from mdadm: https://github.com/neilbrown/mdadm/commit/2c096ebe4b if [[ ${RAID_DISK_STATE} == "in_sync" && ${MD_LEVEL} == 10 && $((MD_LAYOUT & ~0x1ffff)) ]]; then NEAR_COPIES=$((MD_LAYOUT & 0xff)) FAR_COPIES=$(((MD_LAYOUT >> 8) & 0xff)) COPIES=$((NEAR_COPIES * FAR_COPIES)) if [[ $((MD_NUM_RAID_DISKS % COPIES == 0)) && $((COPIES <= 26)) ]]; then DISK_SET=$((RAID_DISK_INDEX % COPIES)) fi fi echo -n "node_md_disk_info{disk_device=\"${DISK_DEVICE}\", md_device=\"${MD_DEVICE_NUM}\"" if [[ -n ${DISK_SET} ]]; then SET_LETTERS=({A..Z}) echo -n ", md_set=\"${SET_LETTERS[${DISK_SET}]}\"" fi echo "} 1" done # Output RAID array metrics # NOTE: Metadata version is a label rather than a separate metric because the version can be a string echo "node_md_info{md_device=\"${MD_DEVICE_NUM}\", md_name=\"${MD_DEVICE}\", raid_level=\"${MD_LEVEL}\", md_metadata_version=\"${MD_METADATA_VERSION}\"} 1" ) done 
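# Worked example of the RAID10 layout decoding above, assuming a
# hypothetical MD_LAYOUT of 0x102 ("near=2, far=1") and 4 raid disks:
#   NEAR_COPIES = 0x102 & 0xff          # = 2
#   FAR_COPIES  = (0x102 >> 8) & 0xff   # = 1
#   COPIES      = 2 * 1                 # = 2
# giving DISK_SET = index % 2, so rd0/rd2 land in set A and rd1/rd3 in set B.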
prometheus-node-exporter-collectors-0+git20211024.8eeeffb/md_info_detail.sh

#!/usr/bin/env bash
# Note: This script uses "mdadm --detail" to get some of the metrics, so it must be run as root.
# It is designed to be run periodically in a cronjob, and output to /var/lib/node_exporter/textfile_collector/md_info_detail.prom
# $ cat /etc/cron.d/prometheus_md_info_detail
# * * * * * bash /var/lib/node_exporter/md_info_detail.sh > /var/lib/node_exporter/md_info_detail.prom.$$ && mv /var/lib/node_exporter/md_info_detail.prom.$$ /var/lib/node_exporter/md_info_detail.prom
set -eu

for MD_DEVICE in /dev/md/*; do
  # Subshell to avoid eval'd variables from leaking between iterations
  (
    # Resolve symlink to discover device, e.g. /dev/md127
    MD_DEVICE_NUM=$(readlink -f "${MD_DEVICE}")

    # Remove /dev/ prefix
    MD_DEVICE_NUM=${MD_DEVICE_NUM#/dev/}
    MD_DEVICE=${MD_DEVICE#/dev/md/}

    # Query sysfs for info about md device
    SYSFS_BASE="/sys/devices/virtual/block/${MD_DEVICE_NUM}/md"
    MD_LAYOUT=$(cat "${SYSFS_BASE}/layout")
    MD_LEVEL=$(cat "${SYSFS_BASE}/level")
    MD_METADATA_VERSION=$(cat "${SYSFS_BASE}/metadata_version")
    MD_NUM_RAID_DISKS=$(cat "${SYSFS_BASE}/raid_disks")

    # Remove 'raid' prefix from RAID level
    MD_LEVEL=${MD_LEVEL#raid}

    # Output disk metrics
    for RAID_DISK in "${SYSFS_BASE}"/rd[0-9]*; do
      DISK=$(readlink -f "${RAID_DISK}/block")
      DISK_DEVICE=$(basename "${DISK}")
      RAID_DISK_DEVICE=$(basename "${RAID_DISK}")
      RAID_DISK_INDEX=${RAID_DISK_DEVICE#rd}
      RAID_DISK_STATE=$(cat "${RAID_DISK}/state")

      DISK_SET=""
      # Determine disk set using logic from mdadm: https://github.com/neilbrown/mdadm/commit/2c096ebe4b
      if [[ ${RAID_DISK_STATE} == "in_sync" && ${MD_LEVEL} == 10 && $((MD_LAYOUT & ~0x1ffff)) ]]; then
        NEAR_COPIES=$((MD_LAYOUT & 0xff))
        FAR_COPIES=$(((MD_LAYOUT >> 8) & 0xff))
        COPIES=$((NEAR_COPIES * FAR_COPIES))

        if [[ $((MD_NUM_RAID_DISKS % COPIES == 0)) && $((COPIES <= 26)) ]]; then
          DISK_SET=$((RAID_DISK_INDEX % COPIES))
        fi
      fi

      echo -n "node_md_disk_info{disk_device=\"${DISK_DEVICE}\", md_device=\"${MD_DEVICE_NUM}\""
      if [[ -n ${DISK_SET} ]]; then
        SET_LETTERS=({A..Z})
        echo -n ", md_set=\"${SET_LETTERS[${DISK_SET}]}\""
      fi
      echo "} 1"
    done

    # Get output from mdadm --detail (Note: root/sudo required)
    MDADM_DETAIL_OUTPUT=$(mdadm --detail /dev/"${MD_DEVICE_NUM}")

    # Output RAID "Devices", "Size" and "Event" metrics, from the output of "mdadm --detail"
    while IFS= read -r line ; do
      # Filter out these keys that have numeric values that increment up
      if echo "$line" | grep -E -q "Devices :|Array Size :| Used Dev Size :|Events :"; then
        MDADM_DETAIL_KEY=$(echo "$line" | cut -d ":" -f 1 | tr -cd '[a-zA-Z0-9]._-')
        MDADM_DETAIL_VALUE=$(echo "$line" | cut -d ":" -f 2 | cut -d " " -f 2 | sed 's:^ ::')
        echo "node_md_info_${MDADM_DETAIL_KEY}{md_device=\"${MD_DEVICE_NUM}\", md_name=\"${MD_DEVICE}\", raid_level=\"${MD_LEVEL}\", md_num_raid_disks=\"${MD_NUM_RAID_DISKS}\", md_metadata_version=\"${MD_METADATA_VERSION}\"} ${MDADM_DETAIL_VALUE}"
      fi
    done <<< "$MDADM_DETAIL_OUTPUT"

    # Output RAID detail metrics info from the output of "mdadm --detail"
    # NOTE: Sending this info as labels rather than separate metrics, because some of them can be strings.
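    # For reference, the assembled line looks like this (all values hypothetical):
    #   node_md_info{md_device="md127", md_name="data", raid_level="1", md_num_raid_disks="2", md_metadata_version="1.2", Raid_Level="raid1", State="clean"} 1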
echo -n "node_md_info{md_device=\"${MD_DEVICE_NUM}\", md_name=\"${MD_DEVICE}\", raid_level=\"${MD_LEVEL}\", md_num_raid_disks=\"${MD_NUM_RAID_DISKS}\", md_metadata_version=\"${MD_METADATA_VERSION}\"" while IFS= read -r line ; do # Filter for lines with a ":", to use for Key/Value pairs in labels if echo "$line" | grep -E -q ":" ; then # Exclude lines with these keys, as they're values are numbers that increment up and captured in individual metrics above if echo "$line" | grep -E -qv "Array Size|Used Dev Size|Events|Update Time" ; then echo -n ", " MDADM_DETAIL_KEY=$(echo "$line" | cut -d ":" -f 1 | tr -cd '[a-zA-Z0-9]._-') MDADM_DETAIL_VALUE=$(echo "$line" | cut -d ":" -f 2- | sed 's:^ ::') echo -n "${MDADM_DETAIL_KEY}=\"${MDADM_DETAIL_VALUE}\"" fi fi done <<< "$MDADM_DETAIL_OUTPUT" echo "} 1" ) done prometheus-node-exporter-collectors-0+git20211024.8eeeffb/mellanox_hca_temp000077500000000000000000000033551413530142000266500ustar00rootroot00000000000000#!/bin/bash set -eu # Script to read Mellanox HCA temperature using the Mellanox mget_temp_ext tool # Copyright 2018 The Prometheus Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Author: Jan Phillip Greimann # check if root if [ "$EUID" -ne 0 ]; then echo "${0##*/}: Please run as root!" >&2 exit 1 fi # check if programs are installed if ! command -v mget_temp_ext >/dev/null 2>&1; then echo "${0##*/}: mget_temp_ext is not installed. Aborting." >&2 exit 1 fi cat <&2 fi done # if device is empty, no device was found if [ -z "${device-}" ]; then echo "${0##*/}: No InfiniBand HCA device found!" >&2 exit 1 fi prometheus-node-exporter-collectors-0+git20211024.8eeeffb/multipathd_info000077500000000000000000000006411413530142000263520ustar00rootroot00000000000000#!/bin/sh # # Description: Expose device mapper multipathing metrics from multipathd. # # Author: Saket Sinha echo '# HELP node_dmpath_info State info for dev-mapper path' echo '# TYPE node_dmpath_info gauge' /sbin/multipathd show paths format '%d %t %T' | /usr/bin/awk '{ if ( NR > 1) {print "node_dmpath_info{device=\""$1"\"," "dm_path_state=\""$2"\"," "path_state=\""$3"\"}" " 1"}}' prometheus-node-exporter-collectors-0+git20211024.8eeeffb/ntpd_metrics.py000077500000000000000000000066561413530142000263220ustar00rootroot00000000000000#!/usr/bin/env python3 # # Description: Extract NTPd metrics from ntpq -np. # Author: Ben Kochie import re import subprocess import sys # NTP peers status, with no DNS lookups. 
ntpq_cmd = ['ntpq', '-np']
ntpq_rv_cmd = ['ntpq', '-c', 'rv 0 offset,sys_jitter,rootdisp,rootdelay']

# Regex to match all of the fields in the output of ntpq -np
metrics_fields = [
    r'^(?P<status>.)(?P<remote>[\w\.]+)',
    r'(?P<refid>[\w\.]+)',
    r'(?P<stratum>\d+)',
    r'(?P<type>\w)',
    r'(?P<when>\d+)',
    r'(?P<poll>\d+)',
    r'(?P<reach>\d+)',
    r'(?P<delay>\d+\.\d+)',
    r'(?P<offset>-?\d+\.\d+)',
    r'(?P<jitter>\d+\.\d+)',
]
metrics_re = r'\s+'.join(metrics_fields)

# Remote types
# http://support.ntp.org/bin/view/Support/TroubleshootingNTP
remote_types = {
    'l': 'local',
    'u': 'unicast',
    'm': 'multicast',
    'b': 'broadcast',
    '-': 'netaddr',
}

# Status codes:
# http://www.eecis.udel.edu/~mills/ntp/html/decode.html#peer
status_types = {
    ' ': 0,
    'x': 1,
    '.': 2,
    '-': 3,
    '+': 4,
    '#': 5,
    '*': 6,
    'o': 7,
}


# Run the ntpq command.
def get_output(command):
    try:
        output = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        return None
    return output.decode()


# Print metrics in Prometheus format.
def print_prometheus(metric, values):
    print("# HELP ntpd_%s NTPd metric for %s" % (metric, metric))
    print("# TYPE ntpd_%s gauge" % (metric))
    for labels in values:
        if labels is None:
            print("ntpd_%s %f" % (metric, values[labels]))
        else:
            print("ntpd_%s{%s} %f" % (metric, labels, values[labels]))


# Parse raw ntpq lines.
def parse_line(line):
    if re.match(r'\s+remote\s+refid', line):
        return None
    if re.match(r'=+', line):
        return None
    if re.match(r'.+\.(LOCL|POOL)\.', line):
        return None
    if re.match(r'^$', line):
        return None
    return re.match(metrics_re, line)


# Main function
def main(argv):
    ntpq = get_output(ntpq_cmd)
    peer_status_metrics = {}
    delay_metrics = {}
    offset_metrics = {}
    jitter_metrics = {}
    for line in ntpq.split('\n'):
        metric_match = parse_line(line)
        if metric_match is None:
            continue
        remote = metric_match.group('remote')
        refid = metric_match.group('refid')
        stratum = metric_match.group('stratum')
        remote_type = remote_types[metric_match.group('type')]
        common_labels = "remote=\"%s\",reference=\"%s\"" % (remote, refid)
        peer_labels = "%s,stratum=\"%s\",type=\"%s\"" % (common_labels, stratum, remote_type)

        peer_status_metrics[peer_labels] = float(status_types[metric_match.group('status')])
        delay_metrics[common_labels] = float(metric_match.group('delay'))
        offset_metrics[common_labels] = float(metric_match.group('offset'))
        jitter_metrics[common_labels] = float(metric_match.group('jitter'))

    print_prometheus('peer_status', peer_status_metrics)
    print_prometheus('delay_milliseconds', delay_metrics)
    print_prometheus('offset_milliseconds', offset_metrics)
    print_prometheus('jitter_milliseconds', jitter_metrics)

    ntpq_rv = get_output(ntpq_rv_cmd)
    for metric in ntpq_rv.split(','):
        metric_name, metric_value = metric.strip().split('=')
        print_prometheus(metric_name, {None: float(metric_value)})


# Go go go!
if __name__ == "__main__":
    main(sys.argv[1:])

prometheus-node-exporter-collectors-0+git20211024.8eeeffb/nvme_metrics.sh

#!/usr/bin/env bash
set -eu

# Dependencies: nvme-cli, jq (packages)
# Based on code from
#  - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/smartmon.sh
#  - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/mellanox_hca_temp
#  - https://github.com/vorlon/check_nvme/blob/master/check_nvme.sh
#
# Author: Henk

# Check if we are root
if [ "$EUID" -ne 0 ]; then
  echo "${0##*/}: Please run as root!" >&2
  exit 1
fi

# Check if programs are installed
if ! command -v nvme >/dev/null 2>&1; then
  echo "${0##*/}: nvme is not installed.
Aborting." >&2 exit 1 fi output_format_awk="$( cat <<'OUTPUTAWK' BEGIN { v = "" } v != $1 { print "# HELP nvme_" $1 " SMART metric " $1; if ($1 ~ /_total$/) print "# TYPE nvme_" $1 " counter"; else print "# TYPE nvme_" $1 " gauge"; v = $1 } {print "nvme_" $0} OUTPUTAWK )" format_output() { sort | awk -F'{' "${output_format_awk}" } # Get the nvme-cli version nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')" echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output # Get devices device_list="$(nvme list -o json | jq -r '.Devices | .[].DevicePath')" # Loop through the NVMe devices for device in ${device_list}; do json_check="$(nvme smart-log -o json "${device}")" disk="${device##*/}" # The temperature value in JSON is in Kelvin, we want Celsius value_temperature="$(echo "$json_check" | jq '.temperature - 273')" echo "temperature_celsius{device=\"${disk}\"} ${value_temperature}" value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')" echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}" value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')" echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}" value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')" echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}" value_critical_warning="$(echo "$json_check" | jq '.critical_warning')" echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}" value_media_errors="$(echo "$json_check" | jq '.media_errors')" echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}" value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')" echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}" value_power_cycles="$(echo "$json_check" | jq '.power_cycles')" echo "power_cycles_total{device=\"${disk}\"} ${value_power_cycles}" value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')" echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}" value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')" echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}" value_data_units_written="$(echo "$json_check" | jq '.data_units_written')" echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}" value_data_units_read="$(echo "$json_check" | jq '.data_units_read')" echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}" value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')" echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}" value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')" echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}" done | format_output prometheus-node-exporter-collectors-0+git20211024.8eeeffb/pacman.sh000077500000000000000000000015641413530142000250410ustar00rootroot00000000000000#!/bin/bash # # # Description: Expose metrics from pacman updates # If installed The bash script *checkupdates*, included with the # *pacman-contrib* package, is used to calculate the number of pending updates. # Otherwise *pacman* is used for calculation. # # Author: Sven Haardiek set -o errexit set -o nounset set -o pipefail if [ -x /usr/bin/checkupdates ] then updates=$(/usr/bin/checkupdates | wc -l) cache=0 else if ! 
updates=$(/usr/bin/pacman -Qu | wc -l)
  then
    updates=0
  fi
  cache=1
fi

echo "# HELP pacman_updates_pending number of pending updates from pacman"
echo "# TYPE pacman_updates_pending gauge"
echo "pacman_updates_pending $updates"

echo "# HELP pacman_updates_pending_from_cache pending updates information is from cache"
echo "# TYPE pacman_updates_pending_from_cache gauge"
echo "pacman_updates_pending_from_cache $cache"

prometheus-node-exporter-collectors-0+git20211024.8eeeffb/smartmon.py

#!/usr/bin/env python3
import argparse
import collections
import csv
import datetime
import decimal
import re
import shlex
import subprocess
import sys

device_info_re = re.compile(r'^(?P<name>[^:]+?)(?:(?:\sis|):)\s*(?P<value>.*)$')

ata_error_count_re = re.compile(
    r'^Error (\d+) \[\d+\] occurred', re.MULTILINE)

self_test_re = re.compile(r'^SMART.*(PASSED|OK)$', re.MULTILINE)

device_info_map = {
    'Vendor': 'vendor',
    'Product': 'product',
    'Revision': 'revision',
    'Logical Unit id': 'lun_id',
    'Model Family': 'model_family',
    'Device Model': 'device_model',
    'Serial Number': 'serial_number',
    'Firmware Version': 'firmware_version',
}

smart_attributes_whitelist = {
    'airflow_temperature_cel',
    'command_timeout',
    'current_pending_sector',
    'end_to_end_error',
    'erase_fail_count_total',
    'g_sense_error_rate',
    'hardware_ecc_recovered',
    'host_reads_mib',
    'host_reads_32mib',
    'host_writes_mib',
    'host_writes_32mib',
    'load_cycle_count',
    'media_wearout_indicator',
    'wear_leveling_count',
    'nand_writes_1gib',
    'offline_uncorrectable',
    'power_cycle_count',
    'power_on_hours',
    'program_fail_count',
    'raw_read_error_rate',
    'reallocated_event_count',
    'reallocated_sector_ct',
    'reported_uncorrect',
    'sata_downshift_count',
    'seek_error_rate',
    'spin_retry_count',
    'spin_up_time',
    'start_stop_count',
    'temperature_case',
    'temperature_celsius',
    'temperature_internal',
    'total_lbas_read',
    'total_lbas_written',
    'udma_crc_error_count',
    'unsafe_shutdown_count',
    'workld_host_reads_perc',
    'workld_media_wear_indic',
    'workload_minutes',
}

Metric = collections.namedtuple('Metric', 'name labels value')

SmartAttribute = collections.namedtuple('SmartAttribute', [
    'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated',
    'when_failed', 'raw_value',
])


class Device(collections.namedtuple('DeviceBase', 'path opts')):
    """Representation of a device as found by smartctl --scan output."""

    @property
    def type(self):
        return self.opts.type

    @property
    def base_labels(self):
        return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'}

    def smartctl_select(self):
        return ['--device', self.type, self.path]


def metric_key(metric, prefix=''):
    return '{prefix}{metric.name}'.format(prefix=prefix, metric=metric)


def metric_format(metric, prefix=''):
    key = metric_key(metric, prefix)
    labels = ','.join(
        '{k}="{v}"'.format(k=k, v=v.replace('"', '\\"'))
        for k, v in metric.labels.items())
    value = decimal.Decimal(metric.value)

    return '{key}{{{labels}}} {value}'.format(
        key=key, labels=labels, value=value)


def metric_print_meta(metric, prefix=''):
    key = metric_key(metric, prefix)
    print('# HELP {key} SMART metric {metric.name}'.format(
        key=key, metric=metric))
    print('# TYPE {key} gauge'.format(key=key))


def metric_print(metric, prefix=''):
    print(metric_format(metric, prefix))


def smart_ctl(*args, check=True):
    """Wrapper around invoking the smartctl binary.

    Returns:
        (str) Data piped to stdout by the smartctl subprocess.
""" return subprocess.run( ['smartctl', *args], stdout=subprocess.PIPE, check=check ).stdout.decode('utf-8') def smart_ctl_version(): return smart_ctl('-V').split('\n')[0].split()[1] def find_devices(): """Find SMART devices. Yields: (Device) Single device found by smartctl. """ parser = argparse.ArgumentParser() parser.add_argument('-d', '--device', dest='type') devices = smart_ctl('--scan-open') for device in devices.split('\n'): device = device.strip() if not device: continue tokens = shlex.split(device, comments=True) if not tokens: continue yield Device(tokens[0], parser.parse_args(tokens[1:])) def device_is_active(device): """Returns whenever the given device is currently active or not. Args: device: (Device) Device in question. Returns: (bool) True if the device is active and False otherwise. """ try: smart_ctl('--nocheck', 'standby', *device.smartctl_select()) except subprocess.CalledProcessError: return False return True def device_info(device): """Query device for basic model information. Args: device: (Device) Device in question. Returns: (generator): Generator yielding: key (str): Key describing the value. value (str): Actual value. """ info_lines = smart_ctl( '--info', *device.smartctl_select() ).strip().split('\n')[3:] matches = (device_info_re.match(line) for line in info_lines) return (m.groups() for m in matches if m is not None) def device_smart_capabilities(device): """Returns SMART capabilities of the given device. Args: device: (Device) Device in question. Returns: (tuple): tuple containing: (bool): True whenever SMART is available, False otherwise. (bool): True whenever SMART is enabled, False otherwise. """ groups = device_info(device) state = { g[1].split(' ', 1)[0] for g in groups if g[0] == 'SMART support'} smart_available = 'Available' in state smart_enabled = 'Enabled' in state return smart_available, smart_enabled def collect_device_info(device): """Collect basic device information. Args: device: (Device) Device in question. Yields: (Metric) metrics describing general device information. """ values = dict(device_info(device)) yield Metric('device_info', { **device.base_labels, **{v: values[k] for k, v in device_info_map.items() if k in values} }, True) def collect_device_health_self_assessment(device): """Collect metric about the device health self assessment. Args: device: (Device) Device in question. Yields: (Metric) Device health self assessment. """ out = smart_ctl('--health', *device.smartctl_select(), check=False) self_assessment_passed = bool(self_test_re.search(out)) yield Metric( 'device_smart_healthy', device.base_labels, self_assessment_passed) def collect_ata_metrics(device): # Fetch SMART attributes for the given device. attributes = smart_ctl( '--attributes', *device.smartctl_select() ) # replace multiple occurrences of whitespace with a single whitespace # so that the CSV Parser recognizes individual columns properly. attributes = re.sub(r'[\t\x20]+', ' ', attributes) # Turn smartctl output into a list of lines and skip to the table of # SMART attributes. attribute_lines = attributes.strip().split('\n')[7:] # Some attributes have multiple IDs but have the same name. Don't # yield attributes that already have been reported before. seen = set() reader = csv.DictReader( (line.strip() for line in attribute_lines), fieldnames=SmartAttribute._fields[:-1], restkey=SmartAttribute._fields[-1], delimiter=' ') for entry in reader: # We're only interested in the SMART attributes that are # whitelisted here. 
entry['name'] = entry['name'].lower() if entry['name'] not in smart_attributes_whitelist: continue # Ensure that only the numeric parts are fetched from the raw_value. # Attributes such as 194 Temperature_Celsius reported by my SSD # are in the format of "36 (Min/Max 24/40)" which can't be expressed # properly as a prometheus metric. m = re.match(r'^(\d+)', ' '.join(entry['raw_value'])) if not m: continue entry['raw_value'] = m.group(1) # Some device models report "---" in the threshold value where most # devices would report "000". We do the substitution here because # downstream code expects values to be convertable to integer. if entry['threshold'] == '---': entry['threshold'] = '0' if entry['name'] in smart_attributes_whitelist and entry['name'] not in seen: labels = { 'name': entry['name'], **device.base_labels, } for col in 'value', 'worst', 'threshold', 'raw_value': yield Metric( 'attr_{col}'.format(col=col), labels, entry[col]) seen.add(entry['name']) def collect_ata_error_count(device): """Inspect the device error log and report the amount of entries. Args: device: (Device) Device in question. Yields: (Metric) Device error count. """ error_log = smart_ctl( '-l', 'xerror,1', *device.smartctl_select(), check=False) m = ata_error_count_re.search(error_log) error_count = m.group(1) if m is not None else 0 yield Metric('device_errors', device.base_labels, error_count) def collect_disks_smart_metrics(wakeup_disks): now = int(datetime.datetime.utcnow().timestamp()) for device in find_devices(): yield Metric('smartctl_run', device.base_labels, now) is_active = device_is_active(device) yield Metric('device_active', device.base_labels, is_active) # Skip further metrics collection to prevent the disk from # spinning up. if not is_active and not wakeup_disks: continue yield from collect_device_info(device) smart_available, smart_enabled = device_smart_capabilities(device) yield Metric( 'device_smart_available', device.base_labels, smart_available) yield Metric( 'device_smart_enabled', device.base_labels, smart_enabled) # Skip further metrics collection here if SMART is disabled # on the device. Further smartctl invocations would fail # anyways. if not smart_available: continue yield from collect_device_health_self_assessment(device) if device.type.startswith('sat'): yield from collect_ata_metrics(device) yield from collect_ata_error_count(device) def main(): parser = argparse.ArgumentParser() parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true') args = parser.parse_args(sys.argv[1:]) version_metric = Metric('smartctl_version', { 'version': smart_ctl_version() }, True) metric_print_meta(version_metric, 'smartmon_') metric_print(version_metric, 'smartmon_') metrics = list(collect_disks_smart_metrics(args.wakeup_disks)) metrics.sort(key=lambda i: i.name) previous_name = None for m in metrics: if m.name != previous_name: metric_print_meta(m, 'smartmon_') previous_name = m.name metric_print(m, 'smartmon_') if __name__ == '__main__': main() prometheus-node-exporter-collectors-0+git20211024.8eeeffb/smartmon.sh000077500000000000000000000170501413530142000254370ustar00rootroot00000000000000#!/bin/bash # Script informed by the collectd monitoring script for smartmontools (using smartctl) # by Samuel B. (c) 2012 # source at: http://devel.dob.sk/collectd-scripts/ # TODO: This probably needs to be a little more complex. The raw numbers can have more # data in them than you'd think. 
# http://arstechnica.com/civis/viewtopic.php?p=22062211 # Formatting done via shfmt -i 2 # https://github.com/mvdan/sh parse_smartctl_attributes_awk="$( cat <<'SMARTCTLAWK' $1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ { gsub(/-/, "_"); printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4 printf "%s_worst{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5 printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6 printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10 } SMARTCTLAWK )" smartmon_attrs="$( cat <<'SMARTMONATTRS' airflow_temperature_cel command_timeout current_pending_sector end_to_end_error erase_fail_count g_sense_error_rate hardware_ecc_recovered host_reads_32mib host_reads_mib host_writes_32mib host_writes_mib load_cycle_count media_wearout_indicator nand_writes_1gib offline_uncorrectable power_cycle_count power_on_hours program_fail_cnt_total program_fail_count raw_read_error_rate reallocated_event_count reallocated_sector_ct reported_uncorrect runtime_bad_block sata_downshift_count seek_error_rate spin_retry_count spin_up_time start_stop_count temperature_case temperature_celsius temperature_internal total_lbas_read total_lbas_written udma_crc_error_count unsafe_shutdown_count unused_rsvd_blk_cnt_tot wear_leveling_count workld_host_reads_perc workld_media_wear_indic workload_minutes SMARTMONATTRS )" smartmon_attrs="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')" parse_smartctl_attributes() { local disk="$1" local disk_type="$2" local labels="disk=\"${disk}\",type=\"${disk_type}\"" sed 's/^ \+//g' | awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null | tr '[:upper:]' '[:lower:]' | grep -E "(${smartmon_attrs})" } parse_smartctl_scsi_attributes() { local disk="$1" local disk_type="$2" local labels="disk=\"${disk}\",type=\"${disk_type}\"" while read -r line; do attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')" attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')" case "${attr_type}" in number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; Current_Drive_Temperature) temp_cel="$(echo "${attr_value}" | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;; Blocks_sent_to_initiator_) lbas_read="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; Blocks_received_from_initiator_) lbas_written="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; Accumulated_start-stop_cycles) power_cycle="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; Elements_in_grown_defect_list) grown_defects="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; esac done [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}" [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}" [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}" [ -n "$lbas_written" ] && echo "total_lbas_written_raw_value{${labels},smart_id=\"242\"} ${lbas_written}" [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}" [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"12\"} ${grown_defects}" } parse_smartctl_info() { local -i smart_available=0 smart_enabled=0 smart_healthy= local disk="$1" disk_type="$2" local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id='' while read -r line; do info_type="$(echo "${line}" | cut 
parse_smartctl_scsi_attributes() {
  local disk="$1"
  local disk_type="$2"
  local labels="disk=\"${disk}\",type=\"${disk_type}\""
  while read -r line; do
    attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')"
    attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')"
    case "${attr_type}" in
      number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
      Current_Drive_Temperature) temp_cel="$(echo "${attr_value}" | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;;
      Blocks_sent_to_initiator_) lbas_read="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
      Blocks_received_from_initiator_) lbas_written="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
      Accumulated_start-stop_cycles) power_cycle="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
      Elements_in_grown_defect_list) grown_defects="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
    esac
  done
  [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}"
  [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}"
  [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}"
  [ -n "$lbas_written" ] && echo "total_lbas_written_raw_value{${labels},smart_id=\"241\"} ${lbas_written}"
  [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}"
  [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"12\"} ${grown_defects}"
}

parse_smartctl_info() {
  local -i smart_available=0 smart_enabled=0
  # Deliberately not an integer: an empty value means "health unknown" and
  # must survive as an empty string (declare -i would coerce it to 0).
  local smart_healthy=''
  local disk="$1" disk_type="$2"
  local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id=''
  while read -r line; do
    info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')"
    info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/')"
    case "${info_type}" in
      Model_Family) model_family="${info_value}" ;;
      Device_Model) device_model="${info_value}" ;;
      Serial_Number) serial_number="${info_value}" ;;
      Firmware_Version) fw_version="${info_value}" ;;
      Vendor) vendor="${info_value}" ;;
      Product) product="${info_value}" ;;
      Revision) revision="${info_value}" ;;
      Logical_Unit_id) lun_id="${info_value}" ;;
    esac
    if [[ "${info_type}" == 'SMART_support_is' ]]; then
      case "${info_value:0:7}" in
        Enabled) smart_available=1; smart_enabled=1 ;;
        Availab) smart_available=1; smart_enabled=0 ;;
        Unavail) smart_available=0; smart_enabled=0 ;;
      esac
    fi
    if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then
      case "${info_value:0:6}" in
        PASSED) smart_healthy=1 ;;
        *) smart_healthy=0 ;;
      esac
    elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then
      case "${info_value:0:2}" in
        OK) smart_healthy=1 ;;
        *) smart_healthy=0 ;;
      esac
    fi
  done
  echo "device_info{disk=\"${disk}\",type=\"${disk_type}\",vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",firmware_version=\"${fw_version}\"} 1"
  echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_available}"
  echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_enabled}"
  [[ "${smart_healthy}" != "" ]] && echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_healthy}"
}

output_format_awk="$(
  cat <<'OUTPUTAWK'
BEGIN { v = "" }
v != $1 {
  print "# HELP smartmon_" $1 " SMART metric " $1;
  print "# TYPE smartmon_" $1 " gauge";
  v = $1
}
{print "smartmon_" $0}
OUTPUTAWK
)"

format_output() {
  sort |
    awk -F'{' "${output_format_awk}"
}

smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')"

echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output

if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then
  exit
fi

device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')"

for device in ${device_list}; do
  disk="$(echo "${device}" | cut -f1 -d'|')"
  type="$(echo "${device}" | cut -f2 -d'|')"
  active=1
  echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')"
  # Check if the device is in a low-power mode
  /usr/sbin/smartctl -n standby -d "${type}" "${disk}" > /dev/null || active=0
  echo "device_active{disk=\"${disk}\",type=\"${type}\"}" "${active}"
  # Skip further metrics to prevent the disk from spinning up
  test ${active} -eq 0 && continue
  # Get the SMART information and health
  /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"
  # Get the SMART attributes
  case ${type} in
    sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
    sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
    scsi) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
    megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
    nvme*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
    *)
      (>&2 echo "disk type is not sat, scsi, nvme or megaraid but ${type}")
      exit
      ;;
  esac
done | format_output
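# Illustrative deployment (an assumption, not part of the original script):
# run from cron and publish atomically for node_exporter's textfile
# collector. The target directory below is hypothetical.
#
#   */5 * * * * root /usr/local/bin/smartmon.sh > /var/lib/node_exporter/smartmon.prom.$$ \
#     && mv /var/lib/node_exporter/smartmon.prom.$$ /var/lib/node_exporter/smartmon.prom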
prometheus-node-exporter-collectors-0+git20211024.8eeeffb/storcli.py
#!/usr/bin/env python3
"""
Script to parse StorCLI's JSON output and expose
MegaRAID health as Prometheus metrics.

Tested against StorCLI 'Ver 1.14.12 Nov 25, 2014'.

StorCLI reference manual:
http://docs.avagotech.com/docs/12352476

Advanced Software Options (ASO) not exposed as metrics currently.

JSON key abbreviations used by StorCLI are documented in the standard command
output, i.e. when you omit the trailing 'J' from the command.

Formatting done with YAPF:
$ yapf -i --style '{COLUMN_LIMIT: 99}' storcli.py
"""

from __future__ import print_function
from datetime import datetime
import argparse
import collections
import json
import os
import shlex
import subprocess

DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as
Prometheus metrics."""
VERSION = '0.0.3'

storcli_path = ''
metric_prefix = 'megaraid_'
metric_list = collections.defaultdict(list)


def main(args):
    """ main """
    global storcli_path
    storcli_path = args.storcli_path

    data = get_storcli_json('/cALL show all J')

    try:
        # All the information is collected underneath the Controllers key
        data = data['Controllers']

        for controller in data:
            response = controller['Response Data']

            handle_common_controller(response)
            if response['Version']['Driver Name'] == 'megaraid_sas':
                handle_megaraid_controller(response)
            elif response['Version']['Driver Name'] == 'mpt3sas':
                handle_sas_controller(response)
    except KeyError:
        pass

    print_all_metrics(metric_list)


def handle_common_controller(response):
    (controller_index, baselabel) = get_basic_controller_info(response)

    controller_info_label = baselabel + ',model="{0}",serial="{1}",fwversion="{2}"'.format(
        str(response['Basics']['Model']).strip(),
        str(response['Basics']['Serial Number']).strip(),
        str(response['Version']['Firmware Version']).strip(),
    )
    add_metric('controller_info', controller_info_label, 1)

    # Split up string to not trigger CodeSpell issues
    if 'ROC temperature(Degree Celc' + 'ius)' in response['HwCfg'].keys():
        response['HwCfg']['ROC temperature(Degree Celsius)'] = response['HwCfg'].pop(
            'ROC temperature(Degree Celc' + 'ius)')
    add_metric('temperature', baselabel,
               int(response['HwCfg']['ROC temperature(Degree Celsius)']))


def handle_sas_controller(response):
    (controller_index, baselabel) = get_basic_controller_info(response)
    add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'OK'))
    add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
    try:
        # The number of physical disks is half of the number of items in this dict.
        # Every disk is listed twice - once for basic info, again for detailed info.
        add_metric('physical_drives', baselabel,
                   len(response['Physical Device Information'].keys()) / 2)
    except AttributeError:
        pass
    for key, basic_disk_info in response['Physical Device Information'].items():
        if 'Detailed Information' in key:
            continue
        create_metrics_of_physical_drive(basic_disk_info[0],
                                         response['Physical Device Information'],
                                         controller_index)
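# Illustrative exposition produced via print_all_metrics() further below
# (label values are made up):
#
#   # HELP megaraid_healthy MegaRAID healthy
#   # TYPE megaraid_healthy gauge
#   megaraid_healthy{controller="0"} 1.0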
def handle_megaraid_controller(response):
    (controller_index, baselabel) = get_basic_controller_info(response)

    # BBU Status Optimal value is 0 for cachevault and 32 for BBU
    add_metric('battery_backup_healthy', baselabel,
               int(response['Status']['BBU Status'] in [0, 32]))
    add_metric('degraded', baselabel, int(response['Status']['Controller Status'] == 'Degraded'))
    add_metric('failed', baselabel, int(response['Status']['Controller Status'] == 'Failed'))
    add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'Optimal'))
    add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
    add_metric('scheduled_patrol_read', baselabel,
               int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence']))
    for cvidx, cvinfo in enumerate(response.get('Cachevault_Info', [])):
        add_metric('cv_temperature', baselabel + ',cvidx="' + str(cvidx) + '"',
                   int(cvinfo['Temp'].replace('C', '')))

    # Both timestamps must be present before computing the clock skew;
    # strptime() would raise on a missing (None) value.
    system_time_str = response['Basics'].get('Current System Date/time')
    controller_time_str = response['Basics'].get('Current Controller Date/Time')
    if system_time_str and controller_time_str:
        system_time = datetime.strptime(system_time_str, "%m/%d/%Y, %H:%M:%S")
        controller_time = datetime.strptime(controller_time_str, "%m/%d/%Y, %H:%M:%S")
        time_difference_seconds = abs(system_time - controller_time).seconds
        add_metric('time_difference', baselabel, time_difference_seconds)

    # Make sure it doesn't crash if it's a JBOD setup
    if 'Drive Groups' in response.keys():
        add_metric('drive_groups', baselabel, response['Drive Groups'])
        add_metric('virtual_drives', baselabel, response['Virtual Drives'])

        for virtual_drive in response['VD LIST']:
            vd_position = virtual_drive.get('DG/VD')
            drive_group, volume_group = -1, -1
            if vd_position:
                drive_group = vd_position.split('/')[0]
                volume_group = vd_position.split('/')[1]
            vd_baselabel = 'controller="{0}",DG="{1}",VG="{2}"'.format(controller_index,
                                                                      drive_group, volume_group)
            vd_info_label = vd_baselabel + ',name="{0}",cache="{1}",type="{2}",state="{3}"'.format(
                str(virtual_drive.get('Name')).strip(),
                str(virtual_drive.get('Cache')).strip(),
                str(virtual_drive.get('TYPE')).strip(),
                str(virtual_drive.get('State')).strip())
            add_metric('vd_info', vd_info_label, 1)

    add_metric('physical_drives', baselabel, response['Physical Drives'])
    if response['Physical Drives'] > 0:
        data = get_storcli_json('/cALL/eALL/sALL show all J')
        drive_info = data['Controllers'][controller_index]['Response Data']
        for physical_drive in response['PD LIST']:
            create_metrics_of_physical_drive(physical_drive, drive_info, controller_index)


def get_basic_controller_info(response):
    controller_index = response['Basics']['Controller']
    baselabel = 'controller="{0}"'.format(controller_index)

    return (controller_index, baselabel)
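# Illustrative 'PD LIST' entry shape consumed by
# create_metrics_of_physical_drive() below (values are made up; the keys are
# the ones the function reads):
#
#   {"EID:Slt": "8:0", "DID": 11, "State": "Onln", "DG": 0,
#    "Intf": "SATA", "Med": "HDD", "Model": "ST4000NM0033"}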
def create_metrics_of_physical_drive(physical_drive, detailed_info_array, controller_index):
    enclosure = physical_drive.get('EID:Slt').split(':')[0]
    slot = physical_drive.get('EID:Slt').split(':')[1]

    pd_baselabel = 'controller="{0}",enclosure="{1}",slot="{2}"'.format(controller_index,
                                                                        enclosure, slot)
    pd_info_label = pd_baselabel + \
        ',disk_id="{0}",interface="{1}",media="{2}",model="{3}",DG="{4}",state="{5}"'.format(
            str(physical_drive.get('DID')).strip(),
            str(physical_drive.get('Intf')).strip(),
            str(physical_drive.get('Med')).strip(),
            str(physical_drive.get('Model')).strip(),
            str(physical_drive.get('DG')).strip(),
            str(physical_drive.get('State')).strip())

    drive_identifier = 'Drive /c' + str(controller_index) + '/e' + str(enclosure) + '/s' + str(
        slot)
    if enclosure == ' ':
        drive_identifier = 'Drive /c' + str(controller_index) + '/s' + str(slot)

    try:
        info = detailed_info_array[drive_identifier + ' - Detailed Information']
        state = info[drive_identifier + ' State']
        attributes = info[drive_identifier + ' Device attributes']
        settings = info[drive_identifier + ' Policies/Settings']

        add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter'])
        add_metric('pd_media_errors', pd_baselabel, state['Media Error Count'])
        add_metric('pd_other_errors', pd_baselabel, state['Other Error Count'])
        add_metric('pd_predictive_errors', pd_baselabel, state['Predictive Failure Count'])
        add_metric('pd_smart_alerted', pd_baselabel,
                   int(state['S.M.A.R.T alert flagged by drive'] == 'Yes'))
        add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0])
        add_metric('pd_device_speed_gbps', pd_baselabel, attributes['Device Speed'].split('.')[0])
        add_metric('pd_commissioned_spare', pd_baselabel,
                   int(settings['Commissioned Spare'] == 'Yes'))
        add_metric('pd_emergency_spare', pd_baselabel, int(settings['Emergency Spare'] == 'Yes'))
        pd_info_label += ',firmware="{0}"'.format(attributes['Firmware Revision'].strip())
        if 'SN' in attributes:
            pd_info_label += ',serial="{0}"'.format(attributes['SN'].strip())
    except KeyError:
        pass

    add_metric('pd_info', pd_info_label, 1)


def add_metric(name, labels, value):
    global metric_list
    try:
        metric_list[name].append({
            'labels': labels,
            'value': float(value),
        })
    except ValueError:
        pass


def print_all_metrics(metrics):
    for metric, measurements in metrics.items():
        print('# HELP {0}{1} MegaRAID {2}'.format(metric_prefix, metric,
                                                  metric.replace('_', ' ')))
        print('# TYPE {0}{1} gauge'.format(metric_prefix, metric))
        for measurement in measurements:
            if measurement['value'] != 'Unknown':
                print('{0}{1}{2} {3}'.format(metric_prefix, metric,
                                             '{' + measurement['labels'] + '}',
                                             measurement['value']))


def get_storcli_json(storcli_args):
    """Get storcli output in JSON format."""
    # Check if storcli is installed and executable
    if not (os.path.isfile(storcli_path) and os.access(storcli_path, os.X_OK)):
        raise SystemExit(1)

    storcli_cmd = shlex.split(storcli_path + ' ' + storcli_args)
    proc = subprocess.Popen(storcli_cmd, shell=False, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    output_json = proc.communicate()[0]
    data = json.loads(output_json.decode("utf-8"))

    if data["Controllers"][0]["Command Status"]["Status"] != "Success":
        raise SystemExit(1)

    return data


if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(description=DESCRIPTION,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    PARSER.add_argument('--storcli_path', default='/opt/MegaRAID/storcli/storcli64',
                        help='path to StorCLI binary')
    PARSER.add_argument('--version', action='version', version='%(prog)s {0}'.format(VERSION))
    ARGS = PARSER.parse_args()

    main(ARGS)
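# Illustrative invocation (the output path is an assumption; the script
# itself shells out to 'storcli64 /cALL show all J'):
#
#   sudo ./storcli.py --storcli_path /opt/MegaRAID/storcli/storcli64 \
#       > /var/lib/node_exporter/megaraid.prom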
prometheus-node-exporter-collectors-0+git20211024.8eeeffb/tw_cli.py
#!/usr/bin/env python3
#
# Prometheus node_exporter textfile collector for 3ware RAID controllers
#
# Half of it based on "Nagios Plugin for 3ware RAID" from "Hari Sekhon",
# Ref: http://github.com/harisekhon/nagios-plugins
# ... with additions for full info (-I) gathering
#
# (c) 2019, Nuno Tavares
#
# You can find the latest version at:
# https://github.com/ntavares/node-exporter-textfile-collector-scripts
#

"""Nagios plugin to test the state of all 3ware RAID arrays and/or drives
on all 3ware controllers on the local machine.

Requires the tw_cli program written by 3ware, which should be called
tw_cli_64 if running on a 64-bit system.

May be remotely executed via any of the standard remote Nagios execution
mechanisms."""

import copy
import os
import re
import sys
from argparse import ArgumentParser
from subprocess import Popen, PIPE, STDOUT

__version__ = '0.1.0'

BIN = None
METRICS = {}
METRIC_PREFIX = 'tw_cli'


def exit_error(msg):
    print('{}_error{{message="{}"}}\t1'.format(METRIC_PREFIX, msg))
    sys.exit(1)


def exit_clean():
    global METRICS
    for mk, mv in METRICS.items():
        print('{}_{}\t{}'.format(METRIC_PREFIX, mk, mv))
    sys.exit(0)


def add_metric(metric, labels, value):
    global METRICS
    labelstrs = []
    for lk, lv in labels.items():
        labelstrs += ['{}="{}"'.format(lk, lv)]
    labelstr = ','.join(labelstrs)
    METRICS[metric + '{' + labelstr + '}'] = str(value)


def _set_twcli_binary():
    """Set the path to the twcli binary"""
    global BIN
    BIN = '/usr/sbin/tw_cli'


def run(cmd, stripOutput=True):
    """Runs a system command and returns stripped output"""
    if not cmd:
        exit_error("Internal python error - no cmd supplied for 3ware utility")

    try:
        # Text mode so communicate() can exchange str with the interactive
        # tw_cli shell under Python 3.
        process = Popen(BIN, stdin=PIPE, stdout=PIPE, stderr=STDOUT,
                        universal_newlines=True)
    except OSError as error:
        error = str(error)
        if 'No such file or directory' in error:
            exit_error("Cannot find 3ware utility '{}'".format(BIN))
        else:
            exit_error("Error trying to run 3ware utility - {}".format(error))

    if process.poll():
        exit_error("3ware utility process ended prematurely")

    try:
        stdout, stderr = process.communicate(cmd)
    except OSError as error:
        exit_error("Unable to communicate with 3ware utility - {}".format(error))

    if not stdout:
        exit_error("No output from 3ware utility")

    output = stdout.split('\n')
    # Strip command prompt, since we're running an interactive CLI shell
    output[0] = re.sub(r'//.*?> ', '', output[0])

    if output[1] == "No controller found.":
        exit_error("No 3ware controllers were found on this machine")

    if process.returncode != 0:
        stderr = stdout.replace('\n', ' ')
        exit_error("3ware utility returned an exit code of {} - {}".format(
            process.returncode, stderr))

    if stripOutput:
        return output[3:-2]
    return output


def test_all(verbosity, warn_true=False):
    """Calls the RAID and drive testing functions"""
    test_arrays(verbosity, warn_true)
    test_drives(verbosity, warn_true)


def test_arrays(verbosity, warn_true=False):
    """Tests all the RAID arrays on all the 3ware controllers on the local machine"""
    lines = run('show')
    controllers = [line.split()[0] for line in lines if line.startswith('c')]

    for controller in controllers:
        unit_lines = run('/{} show unitstatus'.format(controller))
        if verbosity >= 3:
            for unit_line in unit_lines:
                print(unit_line)
            print()
        for unit_line in unit_lines:
            unit_line = unit_line.split()
            state = unit_line[2]
            unit = int(unit_line[0][1:])
            raid = unit_line[1]
            add_metric('array_info', {'controller': controller[1:], 'unit': unit,
                                      'state': state, 'raid': raid}, 1)
            if state == 'OK':
                add_metric('array_status', {'controller': controller[1:], 'unit': unit,
                                            'state': state}, 1)
                continue
            elif state in ('REBUILDING', 'VERIFY-PAUSED', 'VERIFYING', 'INITIALIZING'):
                if state in ('VERIFY-PAUSED', 'VERIFYING', 'INITIALIZING'):
                    percent_complete = unit_line[4]
                else:
                    percent_complete = unit_line[3]
                if warn_true:
                    add_metric('array_status', {'controller': controller[1:], 'unit': unit,
                                                'state': state, 'pct': percent_complete}, 0)
                else:
                    add_metric('array_status', {'controller': controller[1:], 'unit': unit,
                                                'state': state, 'pct': percent_complete}, 1)
            else:
                add_metric('array_status', {'controller': controller[1:], 'unit': unit,
                                            'state': state}, 0)
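# Illustrative 'show unitstatus' row parsed above (the values are made up;
# the column order matches the indices used in test_arrays()):
#
#   u0    RAID-1    OK    -    -    -    465.651    ON    ON
#
# -> tw_cli_array_info{controller="0",unit="0",state="OK",raid="RAID-1"}    1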
def test_drives(verbosity, warn_true=False):
    """Tests all the drives on all the 3ware RAID controllers on the local machine"""
    lines = run('show')
    controllers = []
    for line in lines:
        parts = line.split()
        if parts:
            controllers.append(parts[0])

    for controller in controllers:
        drive_lines = run('/{} show drivestatus'.format(controller))
        if verbosity >= 3:
            for drive_line in drive_lines:
                print(drive_line)
            print()
        for drive_line in drive_lines:
            drive_line = drive_line.split()
            state = drive_line[1]
            drive = drive_line[0]
            if drive[0] == 'd':
                drive = drive[1:]
            array = drive_line[2]
            if array[0] == 'u':
                array = array[1:]
            if state in ('OK', 'NOT-PRESENT'):
                add_metric('drive_status', {'controller': controller[1:], 'drive': drive,
                                            'array': array, 'state': state}, 1)
                continue
            if not warn_true and state in ('VERIFYING', 'REBUILDING', 'INITIALIZING'):
                add_metric('drive_status', {'controller': controller[1:], 'drive': drive,
                                            'array': array, 'state': state}, 1)
                continue
            else:
                add_metric('drive_status', {'controller': controller[1:], 'drive': drive,
                                            'array': array, 'state': state}, 0)


def _parse_temperature(val):
    result = re.split(r'(\d+)(.*)$', val)
    return result[1]


def _parse_yes_ok_on(val):
    if val in ('OK', 'Yes', 'On'):
        return 1
    return 0


def collect_details(cmdprefix, detailsMap, metric, injectedLabels, verbosity):
    """Generic function to parse key = value lists, based on a detailsMap
    which selects the fields to parse. injectedLabels is just baseline labels
    to be included.

    Note that the map may list both labels to append to a catchall 'metric',
    and individual metrics, whose name overrides 'metric' and will contain
    injectedLabels."""
    lines = run('{} show all'.format(cmdprefix), False)
    labels = copy.copy(injectedLabels)
    for line in lines:
        if re.match('^' + cmdprefix + ' (.+?)= (.+?)$', line):
            if verbosity >= 3:
                print(line)
            result = re.split(r'\S+ (.+?)= (.+?)$', line)
            k = result[1].strip()
            v = result[2].strip()
            if k in detailsMap:
                if detailsMap[k]['parser']:
                    v = detailsMap[k]['parser'](v)
                # If this field is meant for a separate metric, do it
                if 'metric' in detailsMap[k]:
                    add_metric(detailsMap[k]['metric'], injectedLabels, v)
                else:
                    labels[detailsMap[k]['label']] = v
    add_metric(metric, labels, 1)


def collect_controller(verbosity):
    CTRL_DETAILS = {
        'Model': {'label': 'model', 'parser': None},
        'Firmware Version': {'label': 'firmware', 'parser': None},
        'Bios Version': {'label': 'bios', 'parser': None},
        'Serial Number': {'label': 'serial', 'parser': None},
        'PCB Version': {'label': 'pcb', 'parser': None},
        'PCHIP Version': {'label': 'pchip', 'parser': None},
        'ACHIP Version': {'label': 'achip', 'parser': None},
    }
    lines = run('show')
    controllers = [line.split()[0] for line in lines if line.startswith('c')]

    for controller in controllers:
        collect_details('/' + controller, CTRL_DETAILS, 'controller_info',
                        {'controller': controller[1:]}, verbosity)
        collect_bbu(controller, verbosity)
        collect_drives(controller, verbosity)


def collect_drives(controller, verbosity):
    DRIVE_DETAILS = {
        # 'Status': {'metric': 'drive_status', 'parser': _parse_yes_ok_on},
        'Reallocated Sectors': {'metric': 'drive_reallocated_sectors', 'parser': None},
        'Temperature': {'metric': 'drive_temperature', 'parser': _parse_temperature},
        'Model': {'label': 'model', 'parser': None},
        'Firmware Version': {'label': 'firmware', 'parser': None},
        'Serial': {'label': 'serial', 'parser': None},
        'Belongs to Unit': {'label': 'unit', 'parser': None},
        'Link Speed': {'label': 'linkspeed', 'parser': None},
    }
    drive_lines = run('/' + controller + ' show drivestatus')
    for drive_line in drive_lines:
        drive_line = drive_line.split()
        drive = drive_line[0]
        collect_details('/' + controller + '/' + drive, DRIVE_DETAILS, 'drive_info',
                        {'controller': controller[1:], 'drive': drive}, verbosity)
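# Illustrative '/c0 show all' line consumed by collect_details() above (the
# value is made up; the '<prefix> <key> = <value>' shape is what the regex
# in collect_details() expects):
#
#   /c0 Model = 9650SE-4LPML
#
# With CTRL_DETAILS this becomes a 'model' label on tw_cli_controller_info.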
def collect_bbu(controller, verbosity):
    BBU_DETAILS = {
        'Firmware Version': {'label': 'firmware', 'parser': None},
        'Serial Number': {'label': 'serial', 'parser': None},
        'Bootloader Version': {'label': 'bootloader', 'parser': None},
        'PCB Revision': {'label': 'pcb', 'parser': None},
        'Battery Installation Date': {'label': 'since', 'parser': None},
        'Online State': {'metric': 'bbu_online', 'parser': _parse_yes_ok_on},
        'BBU Ready': {'metric': 'bbu_ready', 'parser': _parse_yes_ok_on},
        'BBU Status': {'metric': 'bbu_status', 'parser': _parse_yes_ok_on},
        'Battery Voltage status': {'metric': 'bbu_voltage_status', 'parser': _parse_yes_ok_on},
        'Battery Temperature Status': {'metric': 'bbu_temperature_status',
                                       'parser': _parse_yes_ok_on},
        'Battery Temperature Value': {'metric': 'bbu_temperature',
                                      'parser': _parse_temperature},
    }
    collect_details('/' + controller + '/bbu', BBU_DETAILS, 'bbu_info',
                    {'controller': controller[1:]}, verbosity)


def main():
    """Parses command line options and calls the function to test the arrays/drives"""
    parser = ArgumentParser()
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-a', '--arrays-only', action='store_true',
                       help="Only test the arrays (default: %(default)s)")
    group.add_argument('-d', '--drives-only', action='store_true',
                       help="Only test the drives (default: %(default)s)")
    parser.add_argument('-I', '--info', action='store_true', dest='incl_info',
                        help="Include detailed component info (default: %(default)s)")
    parser.add_argument('-w', '--warn-rebuilding', action='store_true',
                        help="Warn when an array or disk is Rebuilding, Initializing or "
                             "Verifying. You might want to do this to keep a closer eye on "
                             "things. Also, these conditions can affect performance so you "
                             "might want to know this is going on (default: %(default)s)")
    # default=0 keeps the 'verbosity >= 3' comparisons valid when -v is not
    # given (action='count' would otherwise leave the value at None).
    parser.add_argument('-v', '--verbose', action='count', dest='verbosity', default=0,
                        help="Verbose mode. By default only one result line is printed as "
                             "per Nagios standards")
    parser.add_argument('-V', '--version', action='version', version=__version__)
    args = parser.parse_args()

    if args.drives_only and args.warn_rebuilding:
        parser.error("You cannot use the -d and -w switches together. Array warning states "
                     "are invalid when testing only drives.")

    if os.geteuid() != 0:
        exit_error("You must be root to run this plugin")

    _set_twcli_binary()

    # -d limits testing to drives, -a to arrays; the default tests both.
    if args.drives_only:
        test_drives(args.verbosity, args.warn_rebuilding)
    elif args.arrays_only:
        test_arrays(args.verbosity, args.warn_rebuilding)
    else:
        test_all(args.verbosity, args.warn_rebuilding)

    if args.incl_info:
        collect_controller(args.verbosity)

    exit_clean()


if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        print("Caught Control-C...")
        sys.exit(1)
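# Illustrative output as printed by exit_clean() above (values are made up;
# metric and value are tab-separated):
#
#   tw_cli_array_status{controller="0",unit="0",state="OK"}    1
#   tw_cli_drive_status{controller="0",drive="p0",array="0",state="OK"}    1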
prometheus-node-exporter-collectors-0+git20211024.8eeeffb/yum.sh
#!/bin/bash
#
# Description: Expose metrics from yum updates.
#
# Author: Slawomir Gonet
#
# Based on apt.sh by Ben Kochie

set -u -o pipefail

# shellcheck disable=SC2016
filter_awk_script='
BEGIN { mute=1 }
/Obsoleting Packages/ {
  mute=0
}
mute && /^[[:print:]]+\.[[:print:]]+/ {
  print $3
}
'

check_upgrades() {
  /usr/bin/yum -q check-update |
    /usr/bin/xargs -n3 |
    awk "${filter_awk_script}" |
    sort |
    uniq -c |
    awk '{print "yum_upgrades_pending{origin=\""$2"\"} "$1}'
}

upgrades=$(check_upgrades)

echo '# HELP yum_upgrades_pending Yum package pending updates by origin.'
echo '# TYPE yum_upgrades_pending gauge'
if [[ -n "${upgrades}" ]]; then
  echo "${upgrades}"
else
  echo 'yum_upgrades_pending{origin=""} 0'
fi
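# Illustrative output (the repository name and count are made up):
#
#   # HELP yum_upgrades_pending Yum package pending updates by origin.
#   # TYPE yum_upgrades_pending gauge
#   yum_upgrades_pending{origin="updates"} 42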