#!/usr/bin/python3
"""Collect host health checks and print them in Prometheus text format.

Each check function below takes the ``ctn`` label value (given with
``--ctn``, empty by default) and records its findings in the ``eo_*``
gauges of a private registry.  When every check has run, the registry is
printed on stdout.  A check that raises increments ``eo_errors`` instead of
aborting the run, unless ``--test`` is given.
"""

import apt
import argparse
from cryptography import x509
from cryptography.hazmat.backends import default_backend
import datetime
import dbus
import mailbox
from git import Repo
import glob
import json
import os
import psutil
import requests
import shlex
import subprocess
from systemd import login, journal
import time

from prometheus_client import CollectorRegistry, Gauge
from prometheus_client.exposition import generate_latest
from prometheus_client.parser import text_string_to_metric_families

# Substrings of priority-3 journal messages that must not be counted as
# errors, keyed by systemd unit name.
JOURNALD_IGNORED_ERRORS = {
    'dovecot.service': [
        'Connection lost to LDAP server, reconnecting',
    ],
    'ssh.service': [
        'maximum authentication attempts exceeded for ',
    ],
    '': [  # match all services (useful for ovpn*)
        'Connection reset, restarting [0]',
    ],
}

apt_cache = apt.Cache()

registry = CollectorRegistry()
eo_errors = Gauge("eo_errors", "failed tests", ["ctn"], registry=registry)
eo_certificates = Gauge("eo_certificates", "certificates", ["ctn", "name"], registry=registry)
eo_debian = Gauge("eo_debian", "debian os", ["ctn"], registry=registry)
eo_etckeeper = Gauge("eo_etckeeper", "etckeeper", ["ctn", "name"], registry=registry)
eo_exim = Gauge("eo_exim", "exim", ["ctn", "name"], registry=registry)
eo_journal = Gauge("eo_journal", "journald", ["ctn", "name"], registry=registry)
eo_local_changes = Gauge("eo_local_changes", "", ["ctn"], registry=registry)
eo_mailboxes = Gauge("eo_mailboxes", "emails in local mailboxes", ["ctn", "name"], registry=registry)
eo_munin = Gauge("eo_munin", "munin", ["ctn", "name"], registry=registry)
eo_nginx = Gauge("eo_nginx", "nginx", ["ctn", "name"], registry=registry)
eo_postgresql = Gauge(
    "eo_postgresql",
    "postgresql",
    ["ctn", "version", "cluster", "context", "name"],
    registry=registry,
)
eo_rabbitmq = Gauge("eo_rabbitmq", "rabbitmq", ["ctn"], registry=registry)
eo_threads = Gauge("eo_threads", "system threads", ["ctn"], registry=registry)
eo_units = Gauge("eo_units", "systemd units", ["ctn", "name", "state"], registry=registry)
eo_packages = Gauge("eo_packages", "packages", ["ctn", "state"], registry=registry)


def run(cmd):
    """Run *cmd* and return its standard output as text.

    *cmd* is a single string split into arguments with ``shlex.split`` (no
    shell is involved).  Every output line is decoded and right-stripped,
    and the lines are joined with ``"\\n"``.  The exit status is not
    checked and standard error is captured and discarded.
    """
    # subprocess.run() drains both pipes and reaps the child; reading only
    # stdout from a Popen with stderr=PIPE can block forever once the
    # stderr pipe fills up, and never waits for the process.
    proc = subprocess.run(
        shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    lines = proc.stdout.split(b"\n")
    if lines[-1] == b"":
        # A trailing newline (or empty output) leaves one empty tail item
        # that is not a line of its own.
        lines.pop()
    return "\n".join(line.decode().rstrip() for line in lines)


def debian(ctn):
    """Export the content of /etc/debian_version as a number (0 for sid)."""
    with open('/etc/debian_version') as fh:
        content = fh.read()
    if 'sid' in content:
        # we shouldn't have non-stable versions
        ve = 0
    else:
        ve = float(content.strip())
    eo_debian.labels(ctn).set(ve)


def etckeeper(ctn):
    """Export whether the /etc git repository is dirty and how many files are untracked."""
    rep = Repo("/etc")
    eo_etckeeper.labels(ctn, "dirty").set(int(rep.is_dirty()))
    eo_etckeeper.labels(ctn, "untracked").set(len(rep.untracked_files))


def exim(ctn):
    """Export exim error-line and deferred-message counts.

    Does nothing when /usr/sbin/exim is absent.  A main-log line counts as
    an error when it contains ``" ** "``, except lines that mention both
    the excluded address and ``support.google.com``.  The "deferred" value
    is the number printed by ``exim -bpc``.
    """
    if not os.path.exists('/usr/sbin/exim'):
        return
    with open('/var/log/exim4/mainlog', 'rb') as fh:
        # Stream the log instead of loading it whole; it can be large.
        for line in fh:
            if b" ** " not in line:
                continue
            if b'benjamin.dauvergne+eo@gmail.com' in line and b'support.google.com' in line:
                continue
            eo_exim.labels(ctn, "errors").inc()
    deferred = int(run("exim -bpc"))
    eo_exim.labels(ctn, "deferred").set(deferred)


def certificates(ctn):
    """Export the number of days left before each known certificate expires.

    Certificates whose file does not exist are skipped.
    """
    certs = ["/etc/exim4/exim.crt"]
    # not_valid_after is a naive datetime expressed in UTC, so compare it
    # with a naive UTC "now" rather than with local time.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    for cert in certs:
        if not os.path.isfile(cert):
            continue
        with open(cert, "rb") as fh:
            c = x509.load_pem_x509_certificate(fh.read(), default_backend())
        remaining = (c.not_valid_after - now).days
        eo_certificates.labels(ctn, cert).set(remaining)


def journald(ctn):
    """Export counts of recent high-priority journal entries.

    Looks at the last fifteen minutes: "critical" is the number of entries
    matching PRIORITY=2; "error" is incremented for every entry of the
    second pass whose message is not listed in JOURNALD_IGNORED_ERRORS, and
    "network_failure" for those that also look like a network/NFS failure.
    """
    j = journal.Reader()
    fifteen = time.time() - 15 * 60
    j.seek_realtime(fifteen)
    j.add_match(PRIORITY=2)
    eo_journal.labels(ctn, "critical").set(len(list(j)))
    j.seek_realtime(fifteen)
    # NOTE(review): this match is added on top of PRIORITY=2 without
    # flushing; matches on one field are presumably OR-ed, in which case
    # this pass also sees the critical entries -- confirm that is intended.
    j.add_match(PRIORITY=3)
    for e in j:
        msg = e.get("MESSAGE", "")
        if isinstance(msg, bytes):
            # Messages that are not valid text may be left as bytes by the
            # reader; decode leniently so the substring tests cannot raise.
            msg = msg.decode(errors="replace")
        ignored_strings = (
            JOURNALD_IGNORED_ERRORS.get(e.get('_SYSTEMD_UNIT'), [])
            + JOURNALD_IGNORED_ERRORS['']
        )
        if any(ignored_string in msg for ignored_string in ignored_strings):
            continue
        eo_journal.labels(ctn, "error").inc()
        if "Connected -> NetworkFailure" in msg or "task nfsd" in msg:
            eo_journal.labels(ctn, "network_failure").inc()


def local_changes(ctn):
    """Export the number of lines of /var/log/check-local-changes.log, if it exists."""
    f = "/var/log/check-local-changes.log"
    if os.path.isfile(f):
        with open(f) as fh:
            n = sum(1 for _ in fh)
        eo_local_changes.labels(ctn).set(n)


def munin(ctn):
    """Export how many munin-node plugin failures were logged in the last hour.

    A log line counts when it sorts after the timestamp of one hour ago
    (plain string comparison against ``%Y/%m/%d-%H:%M:%S``) and contains
    "exited with status".  The current and the first rotated log are read.
    """
    since = datetime.datetime.now() - datetime.timedelta(hours=1)
    since_str = since.strftime("%Y/%m/%d-%H:%M:%S")
    count = 0
    for filename in ["/var/log/munin/munin-node.log", "/var/log/munin/munin-node.log.1"]:
        if not os.path.isfile(filename):
            continue
        with open(filename) as fh:
            count += sum(
                1 for x in fh if x > since_str and "exited with status" in x
            )
    eo_munin.labels(ctn, "errors").set(count)


def nginx(ctn):
    """Export the active connection count from the local nginx status page.

    Silently does nothing when nginx cannot be reached or answers with an
    error status.
    """
    try:
        # Bound the request so a stuck server cannot hang the exporter.
        r = requests.get("http://localhost/nginx_status", timeout=10)
    except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
        return
    if r.ok:
        for line in r.text.splitlines():
            if "Active connections" in line:
                n = int(line.split(':')[1].strip())
                eo_nginx.labels(ctn, "connections").set(n)


def packages(ctn):
    """Export the number of installed packages that can be upgraded."""
    # Cache.get_changes() only lists packages already marked for a change,
    # and nothing in this script marks any, so walk the whole cache instead.
    n = sum(1 for pkg in apt_cache if pkg.is_upgradable)
    eo_packages.labels(ctn, "upgradable").set(n)


def mailboxes(ctn):
    """Export the message count of each recently modified local mailbox.

    Mailboxes untouched for more than 30 days are skipped.
    """
    if os.path.exists('/etc/dovecot/dovecot.conf'):
        # skip servers where dovecot is installed as it's expected to have
        # mailboxes there.
        return
    boxes = glob.glob("/var/spool/mail/*")
    days_ago = time.time() - 30 * 86400
    for m in boxes:
        if not os.path.isfile(m):
            continue
        if not os.stat(m).st_mtime > days_ago:
            # skip mailboxes that didn't change for a long time
            continue
        n = m.split("/")[-1]
        box = mailbox.mbox(m)
        try:
            c = len(box)
        finally:
            box.close()
        eo_mailboxes.labels(ctn, n).set(c)


def postgresql(ctn):
    """Export the state of every PostgreSQL cluster listed by pg_lsclusters.

    For each cluster: whether it runs and, when it does, the archiver
    failure count; then either the number of streaming WAL receivers (when
    the cluster reports being in recovery) or the state and lag of each
    active replication slot.  Does nothing when pg_lsclusters is absent.
    """
    if not os.path.exists("/usr/bin/pg_lsclusters"):
        return
    clusters = json.loads(run("pg_lsclusters --json"))
    for cluster in clusters:
        version = cluster["version"]
        name = cluster["cluster"]
        eo_postgresql.labels(ctn, version, name, "", "running").set(cluster["running"])
        if cluster["running"]:
            # check the archiver status
            archiver_failures = run(
                "sudo -u postgres psql -p %s -tAc 'select failed_count from pg_stat_archiver;'"
                % cluster["port"]
            )
            eo_postgresql.labels(ctn, version, name, "", "archive_failed").set(
                int(archiver_failures)
            )
        if cluster.get("recovery"):
            # we are on a standby, check it's connected to a master
            receiver_statuses = run(
                "sudo -u postgres psql -p %s -tAc 'select status from pg_stat_wal_receiver ;'"
                % cluster["port"]
            )
            eo_postgresql.labels(ctn, version, name, "", "replicating").set(0)
            for status in receiver_statuses.splitlines():
                if status == "streaming":
                    eo_postgresql.labels(ctn, version, name, "", "replicating").inc()
        else:
            # we are on a primary... check the slots are good
            slots = run(
                "sudo -u postgres psql -p %s -tAc 'select active, slot_name, pg_wal_lsn_diff(pg_current_wal_lsn() , restart_lsn) from pg_replication_slots where active;'"
                % cluster["port"]
            )
            active_slot_count = 0
            for slot in slots.splitlines():
                if not slot:
                    # A blank line carries no slot and would break the unpacking.
                    continue
                active, slot_name, delta = slot.split("|")
                eo_postgresql.labels(ctn, version, name, slot_name, "slot-active").set(active == 't')
                eo_postgresql.labels(ctn, version, name, slot_name, "slot-delta").set(int(delta))
                if active == 't':
                    active_slot_count += 1
            eo_postgresql.labels(ctn, version, name, "", "active-slot-count").set(active_slot_count)


def rabbitmq(ctn):
    """Export the total number of messages over all RabbitMQ queues.

    Does nothing when rabbitmqctl is not installed.
    """
    rabbitmqctl = "/usr/sbin/rabbitmqctl"
    if os.path.isfile(rabbitmqctl):
        # run() returns one string: iterate over its lines, not over its
        # characters, otherwise a queue holding 12 messages counts as 1 + 2.
        for i in run("%s list_queues messages" % rabbitmqctl).splitlines():
            if i.isdigit():
                eo_rabbitmq.labels(ctn).inc(int(i))


def threads(ctn):
    """Export the total number of threads over all processes."""
    for p in psutil.process_iter():
        try:
            count = p.num_threads()
        except psutil.NoSuchProcess:
            # The process ended between being listed and being queried.
            continue
        eo_threads.labels(ctn).inc(count)


def units(ctn):
    """Export the number of failed systemd units, ``user@`` units excepted.

    The "name" label carries the list of failed unit names, or an empty
    string when there is none.
    """
    bus = dbus.SystemBus()
    s = bus.get_object("org.freedesktop.systemd1", "/org/freedesktop/systemd1")
    manager = dbus.Interface(s, "org.freedesktop.systemd1.Manager")
    failed = [
        u for u in manager.ListUnits()
        if u[3] == "failed" and not u[0].startswith("user@")
    ]
    # NOTE(review): the list object itself is handed over as label value and
    # is presumably stringified by prometheus_client; kept as is because
    # existing dashboards may rely on that exact form.
    name = [u[0].replace('dbus.String', '') for u in failed]
    eo_units.labels(ctn, name or "", "failed").set(len(failed))


def run_in_machines(ctn):
    """Run this exporter inside every machine known to logind and merge its output.

    Each sample printed by the nested exporter is copied into the gauge of
    the same name in this process, with the labels it was printed with.
    *ctn* is not used: nested samples carry the machine name as ``ctn``.
    """
    for machine in login.machine_names():
        r = run(
            "systemd-run --wait --pipe -q --machine %s /usr/bin/prometheus-system-exporter.py --ctn %s"
            % (machine, machine)
        )
        for m in text_string_to_metric_families(r):
            # Gauges are module-level variables named after their metric.
            metric = globals()[m.name]
            for s in m.samples:
                metric.labels(**s.labels).set(s.value)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--test", action="store_true", help="raise errors")
    parser.add_argument("--ctn", default="")
    args = parser.parse_args()
    for test in [
        certificates,
        debian,
        etckeeper,
        exim,
        journald,
        local_changes,
        mailboxes,
        munin,
        nginx,
        packages,
        postgresql,
        rabbitmq,
        threads,
        units,
        run_in_machines,
    ]:
        try:
            test(args.ctn)
        except Exception:
            # One failing check must not prevent the others from being
            # exported; it is reported through eo_errors instead.
            eo_errors.labels(ctn=args.ctn).inc()
            if args.test:
                raise
    print(generate_latest(registry).decode())