pax_global_header00006660000000000000000000000064123530733340014515gustar00rootroot0000000000000052 comment=bd5d834cfa719cfadc9473353e21469f0f5398b5 syslog-nagios-bridge/000077500000000000000000000000001235307333400151115ustar00rootroot00000000000000syslog-nagios-bridge/COPYING000066400000000000000000000015051235307333400161450ustar00rootroot00000000000000# syslog-nagios-bridge - transfer Syslog events to Nagios checkresults file # # Project page: https://github.com/dpocock/syslog-nagios-bridge # # Copyright (C) 2014 Daniel Pocock http://danielpocock.com # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. syslog-nagios-bridge/README.txt000066400000000000000000000047251235307333400166170ustar00rootroot00000000000000 syslog-nagios-bridge Copyright (C) 2014 Daniel Pocock http://danielpocock.com https://github.com/dpocock/syslog-nagios-bridge Dependencies ------------ python-netsyslog https://github.com/dpocock/python-netsyslog pynag http://pynag.org (using latest code from Git, June 2014, with the new Utils.CheckResult support) Installation ------------ Copy syslog-nagios-bridge.py to a suitable location (e.g. /usr/local/bin) Copy the configuration file to a suitable location (e.g. /etc/nagios3) Update the config file settings Edit your syslog daemon configuration to tell it to send events to syslog-nagios-bridge over TCP. 
For example, append the following to /etc/rsyslog.conf: # for rsyslog >= v7.x: #action(type="omfwd" Target="127.0.0.1" Port="30514" Protocol="tcp" TCP_Framing="octet-counted") # for rsyslog < v7.x *.* @@127.0.0.1:30514 Make sure the port number matches the "bind_port" in syslog-bridge.conf syslog-nagios-bridge.py automatically creates service definitions for each syslog tag name that it detects. It can put them directly into the nagios configuration directories or it can generate them in some other place and you can copy them over manually. In any case, for Nagios to report on a particular host/syslog tag, there must be a corresponding service definition in /etc/nagios3/conf.d/whatever.cfg. To alert on errors from the httpd process, you may use the following: define service{ use generic-service host_name myhost service_description httpd - SysLog check_command return-unknown active_checks_enabled 0 passive_checks_enabled 1 max_check_attempts 1 } After doing the configuration, start the bridge and restart/reload the syslog daemon and Nagios itself: # su - nagios -c /usr/local/bin/syslog-nagios-bridge.py # service rsyslog restart # service nagios3 reload The relevant services will go into the CRITICAL state after error events are detected by syslog-nagios-bridge. Nagios has no way to know when the logs have been checked and whether anybody has taken action to correct the errors. Consequently, the services will remain in the CRITICAL state indefinitely. A user must go into the Nagios web interface and use the option "Submit passive check result for this service" to put the service back in the OK state. Normally this is only done after manually investigating the error. syslog-nagios-bridge/syslog-bridge.conf000066400000000000000000000037131235307333400205360ustar00rootroot00000000000000 # Log file for our own activity. If not specified, syslog is used. # (take care to avoid a feedback loop!) 
log_file = "/var/log/nagios3/syslog-bridge.log" # Default level is WARNING # (take care to avoid a feedback loop!) #log_level = logging.DEBUG # The TCP port where we listen for syslog events bind_port = 30514 # The check_result_path configured in nagios.cfg: checkresult_dir = "/var/lib/nagios3/spool/checkresults" # A directory where generated service definition files can be # placed. svc_def_dir = "/etc/nagios3/syslog.d" # Generated service definitions should inherit from this template: svc_tmpl = "generic-service" # Specify a check command for inclusion in the service definition. # The command is never actually executed as the service checks are # only passive. Can be a simple script that always returns 3 (UNKNOWN) svc_check_dummy = "return-unknown" # Threshold for syslog events to generate Nagios checkresults # Choose between LOG_WARNING or LOG_ERR #svc_state_threshold = syslog.LOG_WARNING svc_state_threshold = syslog.LOG_ERR # We want to avoid flooding Nagios with multiple checkresults for a single # service if there are hundreds of log entries per second. # Therefore, after submitting a checkresult, we ignore any further events # for the same service during the subsequent period svc_submission_interval # (in seconds) svc_submission_interval = 10 # RFC3164 hostnames usually do not have the domain part. # RFC5424 hostnames usually do have the domain part (FQDN). # In Nagios, people normally use the short name/alias and not the FQDN. # Setting this option ensures that hostnames are normalized for submission # to Nagios: hostname_strip_fqdn = True # If LogAnalyzer is available, the Nagios service definitions can include # a link to the specific log query for the given host/tag. This link # will be displayed in the Nagios web UI so the user can click to # go directly from Nagios to LogAnalyzer. 
#loganalyzer_url = "http://log-host/loganalyzer/" syslog-nagios-bridge/syslog-nagios-bridge.py000077500000000000000000000211311235307333400215140ustar00rootroot00000000000000#!/usr/bin/python
#
# syslog-nagios-bridge - transfer Syslog events to Nagios checkresults file
#
# Project page: https://github.com/dpocock/syslog-nagios-bridge
#
# Copyright (C) 2014 Daniel Pocock http://danielpocock.com
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
############################################################################

import argparse
import logging
import logging.handlers
import netsyslog
import os
import Queue
import re
from threading import Thread
import sys
import syslog
import time
import urllib

from pynag.Utils import CheckResult

# Frames received by the collector thread, waiting for the main thread.
q = Queue.Queue()

# In-memory cache: hosts[hostname][tag] -> dict of per-service properties
# (currently only "last_event", the time of the last submitted checkresult).
hosts = {}

# default values (set from the config file)
log_file = None
log_level = logging.WARNING
loganalyzer_url = None
# lookup_app() treats None as "do not generate service definitions"
svc_def_dir = None

# The root logger; its handlers and level are configured by the main program.
# Defined here so that the functions below do not depend on the main program
# having run first.
logger = logging.getLogger()

# Any run of characters that may not appear in a tag name.
_INVALID_TAG_CHARS = re.compile(r"\W+")

# Host names arrive from the network and end up in a file name and in a
# Nagios configuration file, so only a conservative character set is
# accepted (letters, digits, "_", ".", "-" and ":" for IPv6 literals).
_VALID_HOSTNAME = re.compile(r"^[A-Za-z0-9_.:-]+\Z")


# This is a subclass of the SyslogTCPHandler from the netsyslog module.
# It receives a notification (call to handle_message) each time a
# syslog event arrives from the network and it puts them into a queue
# for processing on the main thread.
class MyHandler(netsyslog.SyslogTCPHandler):

    def handle_message(self, frame):
        """Handle parsed Syslog frames.

        Logs the frame at debug level and queues it for the main thread.
        """
        # Arguments are passed separately so that the message is only
        # formatted when debug logging is actually enabled.
        logger.debug(
            "severity: %d, facility: %d, tag: %s, PID: %s, host: %s, ts: %s, content: %s",
            frame.pri.severity,
            frame.pri.facility,
            frame.msg.tag,
            frame.msg.pid,
            frame.header.hostname,
            frame.header.timestamp,
            frame.msg.content)
        # queue the frame for examination by the main thread
        q.put(frame)


# make sure host names don't contain domain parts
# (some bad syslog implementations send domain parts)
# when hostname_strip_fqdn is set, also normalize to lowercase
def clean_host_name(hostname):
    """Return a host name that is safe to submit to Nagios, or None.

    None is returned when the host name is missing ("" or "-"), empty
    after stripping the domain part, or contains characters that are not
    safe to use in a file name or a Nagios object definition.
    """
    if hostname is None or hostname in ("", "-"):
        return None
    if hostname_strip_fqdn:
        hostname = hostname.split(".")[0].lower()
    if not _VALID_HOSTNAME.match(hostname):
        # %r keeps control characters out of our own log
        logger.warning("detected invalid host name: %r", hostname)
        return None
    return hostname


# make sure tag names don't contain illegal characters
def clean_tag_name(tag):
    """Return the tag with all non-word characters removed, or None.

    None is returned when the tag is missing ("" or "-") or when nothing
    is left after removing the illegal characters.
    """
    if tag is None or tag in ("", "-"):
        return None
    _tag = _INVALID_TAG_CHARS.sub("", tag)
    if tag != _tag:
        logger.warning("detected invalid tag name: %s", tag)
    if not _tag:
        return None
    return _tag


def make_desc(hostname, tag):
    """Create a service description name.

    Should return names that exactly match the service descriptions
    in the Nagios configuration.
    """
    return tag + " - SysLog"


def _service_def_lines(hostname, tag):
    """Build the lines of a Nagios service definition for host/tag."""
    lines = [
        "define service{\n",
        " use %s\n" % svc_tmpl,
        " host_name %s\n" % hostname,
        " service_description %s\n" % make_desc(hostname, tag),
        " # this is never really executed because active_checks_enabled=0:\n",
        " check_command %s\n" % svc_check_dummy,
        " active_checks_enabled 0\n",
        " passive_checks_enabled 1\n",
        " # generate email notifications after first error:\n",
        " max_check_attempts 1\n",
    ]
    if loganalyzer_url is not None:
        search_query = "syslogtag:=%s source:=%s" % (tag, hostname)
        action_url = "%s?filter=%s" % (loganalyzer_url,
                                       urllib.quote(search_query))
        lines.append(" action_url %s\n" % action_url)
    lines.append(" }\n")
    return lines


def _ensure_service_def(hostname, tag):
    """Create the service definition file for host/tag if it is missing."""
    _filename = "syslog_%s_%s.cfg" % (hostname, tag)
    svc_def_filename = os.path.join(svc_def_dir, _filename)
    if os.path.exists(svc_def_filename):
        return
    logger.debug("creating service def for host %s, tag %s", hostname, tag)
    # FIXME: can pynag create the service def through the API?
    # Build the content first so that a formatting problem cannot leave
    # an empty file behind.
    lines = _service_def_lines(hostname, tag)
    try:
        with open(svc_def_filename, "w") as f:
            f.writelines(lines)
    except (IOError, OSError) as e:
        # Not being able to write the definition must not prevent the
        # checkresult itself from being submitted.
        logger.error("failed to write service def %s: %s",
                     svc_def_filename, e)


def lookup_app(hostname, tag):
    """Lookup the properties for the tag/application.

    Look through our in-memory cache for properties related to
    the tag/application on the given host. The first time a host/tag
    pair is seen, a cache entry is created and, if svc_def_dir is set,
    a service definition file is generated for it.
    """
    _host = hosts.get(hostname)
    if _host is None:
        _host = hosts[hostname] = {}
        logger.debug("first event from host: %s", hostname)

    _app = _host.get(tag)
    if _app is None:
        _app = _host[tag] = {}
        logger.debug("first event from tag: %s", tag)
        if svc_def_dir is not None:
            # see if we need to create a service definition for the tag
            _ensure_service_def(hostname, tag)

    return _app


def handle_frame(frame):
    """Handle a SysLog event.

    Looks at the event to decide if it should generate a Nagios
    checkresult. Events with a bad host name or tag, events less severe
    than svc_state_threshold and events arriving within
    svc_submission_interval seconds of the last submission for the same
    service are ignored.
    """
    # Get the hostname and tag, lookup the properties for this pair:
    _hostname = clean_host_name(frame.header.hostname)
    if _hostname is None:
        logger.debug("bad or missing hostname, ignoring message")
        return
    _tag = clean_tag_name(frame.msg.tag)
    if _tag is None:
        logger.debug("bad or missing tag, ignoring message")
        return
    _app = lookup_app(_hostname, _tag)

    # Check if we need to notify Nagios
    # (lower syslog severity numbers are more severe)
    if frame.pri.severity > svc_state_threshold:
        return

    last_event = _app.get("last_event")
    if (last_event is not None and
            (last_event + svc_submission_interval) > time.time()):
        # ignore multiple error events with svc_submission_interval
        # seconds after the last checkresult was sent to Nagios
        return

    logger.debug("Must tell Nagios")
    check_result = CheckResult(checkresult_dir)
    desc = make_desc(_hostname, _tag)
    output = "PID=%s, logged: %s" % (frame.msg.pid, frame.msg.content)
    if frame.pri.severity <= syslog.LOG_ERR:
        # for LOG_ERR and worse: CRITICAL
        ret = 2
    else:
        # LOG_WARNING (or anything milder that passed the threshold): WARNING
        ret = 1
    check_result.service_result(
        _hostname,
        desc,
        return_code=ret,
        output=output,
        check_type=1,
        check_options=0,
        scheduled_check=0,
        reschedule_check=0,
        latency=0.1,
        exited_ok=1)
    check_result.submit()
    _app["last_event"] = time.time()


# main program code
if __name__ == '__main__':
    try:
        # parse command line
        parser = argparse.ArgumentParser(
            description="receive Syslog events and generate Nagios check results file")
        parser.add_argument(
            "config_file",
            nargs="?",
            help="configuration file",
            default="/etc/nagios3/syslog-bridge.conf")
        args = parser.parse_args()

        # read the configuration file
        # NOTE: the file is executed as Python code in this module's
        # namespace, so it must only be writable by trusted users.
        execfile(args.config_file)

        # Setup logging.
        # *** Be careful not to create a feedback loop ***
        if log_file is not None:
            logger.addHandler(logging.FileHandler(log_file))
        else:
            logger.addHandler(logging.handlers.SysLogHandler())
        logger.setLevel(log_level)

        # Run the Collector in a thread to listen for incoming connections
        c = netsyslog.Collector(bind_port, MyHandler)
        thread = Thread(target=c.run)
        thread.daemon = True
        thread.start()

        while True:
            try:
                # we set a timeout for Queue.get() so that it can be
                # interrupted by ctrl-C. See issue no. 1360
                # http://bugs.python.org/issue1360
                frame = q.get(True, 1)
            except Queue.Empty:
                continue
            logger.debug("got a frame from the queue")
            try:
                handle_frame(frame)
            except Exception as e:
                # one bad event must not stop the bridge
                logger.error("Failed to handle an event: %s", e)

    except KeyboardInterrupt:
        # ctrl-C: leave quietly, the collector thread is a daemon thread
        pass
    except Exception as e:
        logging.error("Unexpected failure: %s" % e)
        # report the failure to the caller (init script, shell, ...)
        sys.exit(1)