diff --git a/Makefile b/Makefile index 6201920..1baa974 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ NAME = publik-infra all: DIST_FILES = \ + prometheus-entrouvert-exporter \ sysadmin-doc clean: diff --git a/debian/control b/debian/control index 5cb97f9..28bbc1d 100644 --- a/debian/control +++ b/debian/control @@ -14,3 +14,17 @@ Package: publik-sysadmin-doc Architecture: all Depends: ${misc:Depends} Description: Documentation for doc-publik.entrouvert.com/guide-de-l-administrateur-systeme/ + +Package: prometheus-entrouvert-exporter +Architecture: all +Depends: ${misc:Depends}, + prometheus-node-exporter, + python3-apt, + python3-cryptography, + python3-dbus, + python3-git, + python3-prometheus-client, + python3-psutil, + python3-requests, + python3-systemd +Description: useful metrics exporter for the node exporter textfile collector diff --git a/debian/prometheus-entrouvert-exporter.install b/debian/prometheus-entrouvert-exporter.install new file mode 100644 index 0000000..919ed87 --- /dev/null +++ b/debian/prometheus-entrouvert-exporter.install @@ -0,0 +1 @@ +prometheus-entrouvert-exporter/prometheus-entrouvert-exporter.py usr/bin diff --git a/debian/prometheus-entrouvert-exporter.service b/debian/prometheus-entrouvert-exporter.service new file mode 100644 index 0000000..886fb57 --- /dev/null +++ b/debian/prometheus-entrouvert-exporter.service @@ -0,0 +1,6 @@ +[Unit] +Description=Collect Entrouvert metrics + +[Service] +Type=oneshot +ExecStart=/bin/bash -c "/usr/bin/prometheus-entrouvert-exporter.py > /var/lib/prometheus/node-exporter/entrouvert.prom.tmp && mv /var/lib/prometheus/node-exporter/entrouvert.prom.tmp /var/lib/prometheus/node-exporter/entrouvert.prom" diff --git a/debian/prometheus-entrouvert-exporter.timer b/debian/prometheus-entrouvert-exporter.timer new file mode 100644 index 0000000..f8ccdd1 --- /dev/null +++ b/debian/prometheus-entrouvert-exporter.timer @@ -0,0 +1,9 @@ +[Unit] +Description=Run entrouvert metrics collection every 15 minutes + +[Timer] +OnBootSec=0 +OnUnitActiveSec=15min + +[Install] +WantedBy=timers.target diff --git 
a/prometheus-entrouvert-exporter/prometheus-entrouvert-exporter.py b/prometheus-entrouvert-exporter/prometheus-entrouvert-exporter.py
new file mode 100755
index 0000000..6e4e4a0
--- /dev/null
+++ b/prometheus-entrouvert-exporter/prometheus-entrouvert-exporter.py
@@ -0,0 +1,274 @@
+#!/usr/bin/python3
+"""Export Entrouvert host metrics in the Prometheus text exposition format.
+
+Every check fills one of the ``eo_*`` gauges below; the resulting registry is
+printed on stdout, ready to be written to a node exporter textfile.
+"""
+import apt
+import argparse
+from cryptography import x509
+from cryptography.hazmat.backends import default_backend
+import datetime
+import dbus
+import mailbox
+from git import Repo
+import glob
+import os
+import psutil
+import requests
+import shlex
+import subprocess
+from systemd import login, journal
+import time
+
+from prometheus_client import CollectorRegistry, Gauge
+from prometheus_client.exposition import generate_latest
+from prometheus_client.parser import text_string_to_metric_families
+
+
+apt_cache = apt.Cache()
+registry = CollectorRegistry()
+eo_errors = Gauge("eo_errors", "failed tests", ["ctn"], registry=registry)
+eo_certificates = Gauge("eo_certificates", "certificates", ["ctn", "name"], registry=registry)
+eo_debian = Gauge("eo_debian", "debian os", ["ctn"], registry=registry)
+eo_etckeeper = Gauge("eo_etckeeper", "etckeeper", ["ctn", "name"], registry=registry)
+eo_exim = Gauge("eo_exim", "exim", ["ctn", "name"], registry=registry)
+eo_journal = Gauge("eo_journal", "journald", ["ctn", "name"], registry=registry)
+eo_local_changes = Gauge("eo_local_changes", "", ["ctn"], registry=registry)
+eo_mailboxes = Gauge("eo_mailboxes", "emails in local mailboxes", ["ctn", "name"], registry=registry)
+eo_munin = Gauge("eo_munin", "munin", ["ctn", "name"], registry=registry)
+eo_nginx = Gauge("eo_nginx", "nginx", ["ctn", "name"], registry=registry)
+eo_postgresql = Gauge("eo_postgresql", "postgresql", ["ctn", "role", "name"], registry=registry)
+eo_rabbitmq = Gauge("eo_rabbitmq", "rabbitmq", ["ctn"], registry=registry)
+eo_threads = Gauge("eo_threads", "system threads", ["ctn"], registry=registry)
+eo_units = Gauge("eo_failed_units", "systemd failed units", ["ctn", "state"], registry=registry)
+eo_packages = Gauge("eo_packages", "packages", ["ctn", "state"], registry=registry)
+eo_sessions = Gauge("eo_sessions", "sessions", ["ctn"], registry=registry)
+
+
+def run(cmd):
+    """Run the command line *cmd* and return its standard output as text.
+
+    *cmd* is split with shlex (no shell is involved).  Trailing whitespace is
+    stripped from every line; stderr is discarded and the exit status ignored.
+    """
+    # subprocess.run() drains the pipe and waits for the child, so a chatty
+    # command cannot block on a full pipe and no zombie is left behind.
+    p = subprocess.run(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+    return "\n".join(line.decode().rstrip() for line in p.stdout.splitlines())
+
+
+def debian(ctn):
+    """Export the content of /etc/debian_version as a float."""
+    with open('/etc/debian_version') as fh:
+        ve = float(fh.read().strip())
+    eo_debian.labels(ctn).set(ve)
+
+
+def etckeeper(ctn):
+    """Export whether the /etc git repository is dirty and its untracked file count."""
+    rep = Repo("/etc")
+    eo_etckeeper.labels(ctn, "dirty").set(int(rep.is_dirty()))
+    eo_etckeeper.labels(ctn, "untracked").set(len(rep.untracked_files))
+
+
+def exim(ctn):
+    """Export the number of "**" lines in the exim main log and the queue size."""
+    # errors="replace": a log line with undecodable bytes must not abort the check
+    with open('/var/log/exim4/mainlog', errors="replace") as fh:
+        errors = sum(1 for line in fh if "**" in line)
+    eo_exim.labels(ctn, "errors").set(errors)
+    deferred = int(run("exim -bpc"))
+    eo_exim.labels(ctn, "deferred").set(deferred)
+
+
+def certificates(ctn):
+    """Export the number of days left before each known certificate expires."""
+    certs = ["/etc/exim4/exim.crt"]
+    for cert in certs:
+        if os.path.isfile(cert):
+            with open(cert, "rb") as fh:
+                c = x509.load_pem_x509_certificate(fh.read(), default_backend())
+            # not_valid_after is a naive UTC datetime: compare it with UTC, not local time
+            remaining = (c.not_valid_after - datetime.datetime.utcnow()).days
+            eo_certificates.labels(ctn, cert).set(remaining)
+
+
+def journald(ctn):
+    """Export counts of critical and error journal entries of the last 15 minutes."""
+    j = journal.Reader()
+    fifteen = time.time() - 15 * 60
+    j.add_match(PRIORITY=2)
+    j.seek_realtime(fifteen)
+    eo_journal.labels(ctn, "critical").set(len(list(j)))
+    # matches on the same field are ORed, so drop PRIORITY=2 before matching PRIORITY=3
+    j.flush_matches()
+    j.add_match(PRIORITY=3)
+    j.seek_realtime(fifteen)
+    for e in j:
+        eo_journal.labels(ctn, "error").inc()
+        msg = e.get("MESSAGE", "")
+        if isinstance(msg, bytes):
+            # binary messages are returned as bytes
+            msg = msg.decode(errors="replace")
+        if "Connected -> NetworkFailure" in msg or "task nfsd" in msg:
+            eo_journal.labels(ctn, "network_failure").inc()
+
+
+def local_changes(ctn):
+    """Export the number of lines of the check-local-changes log, when it exists."""
+    f = "/var/log/check-local-changes.log"
+    if os.path.isfile(f):
+        with open(f, errors="replace") as fh:
+            n = sum(1 for _ in fh)
+        eo_local_changes.labels(ctn).set(n)
+
+
+def munin(ctn):
+    """Export the number of error lines of the munin-node log, ignoring mail_space ones."""
+    f = "/var/log/munin/munin-node.log"
+    if os.path.isfile(f):
+        with open(f, errors="replace") as fh:
+            n = sum(1 for line in fh if "rror" in line and "mail_space" not in line)
+        eo_munin.labels(ctn, "errors").set(n)
+
+
+def nginx(ctn):
+    """Export the active connection count read from the local nginx status page."""
+    # the timeout keeps a hung nginx from blocking the whole (oneshot) collection
+    r = requests.get("http://localhost/nginx_status", timeout=10)
+    if r.ok:
+        # iterate over lines: iterating over r.text itself yields single characters
+        for line in r.text.splitlines():
+            if "Active connections" in line:
+                n = int(line.split(':')[1].strip())
+                eo_nginx.labels(ctn, "connections").set(n)
+
+
+def packages(ctn):
+    """Export the number of installed packages that have an upgrade candidate."""
+    # get_changes() only lists packages marked for a change, which is nothing on a
+    # freshly opened cache: look at every package instead.
+    n = sum(1 for pkg in apt_cache if pkg.is_upgradable)
+    eo_packages.labels(ctn, "upgradable").set(n)
+
+
+def mailboxes(ctn):
+    """Export the number of messages of every mbox in /var/spool/mail."""
+    for m in glob.glob("/var/spool/mail/*"):
+        n = os.path.basename(m)
+        c = len(mailbox.mbox(m))
+        eo_mailboxes.labels(ctn, n).set(c)
+
+
+def _count_processes(needle):
+    """Return how many processes have *needle* in the first element of their cmdline."""
+    n = 0
+    for p in psutil.process_iter():
+        try:
+            cmd = p.cmdline()
+        except psutil.Error:
+            # the process exited (or became unreadable) while we were iterating
+            continue
+        if cmd and needle in cmd[0]:
+            n += 1
+    return n
+
+
+def postgresql(ctn):
+    """Export the number of WAL senders (primary) or WAL receivers (secondary)."""
+    # NOTE(review): membership in apt.Cache means the package is known to apt, not
+    # necessarily installed -- confirm this is the intended test.
+    if "postgresql" not in apt_cache:
+        return
+    recovery = glob.glob("/var/lib/postgresql/*/*/recovery.conf")
+    if not recovery:
+        n = _count_processes('walsender')
+        eo_postgresql.labels(ctn, "primary", "replicators").set(n)
+    else:
+        n = _count_processes('walreceiver')
+        eo_postgresql.labels(ctn, "secondary", "replicating").set(n)
+
+
+def rabbitmq(ctn):
+    """Export the total number of messages held in the rabbitmq queues."""
+    rabbitmqctl = "/usr/sbin/rabbitmqctl"
+    if os.path.isfile(rabbitmqctl):
+        output = run("%s list_queues messages" % rabbitmqctl)
+        # split into lines: iterating over the string itself would add up single digits
+        counts = [line.strip() for line in output.splitlines()]
+        n = sum(int(c) for c in counts if c.isdigit())
+        eo_rabbitmq.labels(ctn).set(n)
+
+
+def sessions(ctn):
+    """Export the number of logged-in user sessions."""
+    n = len(psutil.users())
+    eo_sessions.labels(ctn).set(n)
+
+
+def threads(ctn):
+    """Export the total number of threads over all processes."""
+    n = 0
+    for p in psutil.process_iter():
+        try:
+            n += p.num_threads()
+        except psutil.Error:
+            # the process exited while we were iterating
+            continue
+    eo_threads.labels(ctn).set(n)
+
+
+def units(ctn):
+    """Export the number of systemd units whose active state is "failed"."""
+    bus = dbus.SystemBus()
+    s = bus.get_object("org.freedesktop.systemd1", "/org/freedesktop/systemd1")
+    manager = dbus.Interface(s, "org.freedesktop.systemd1.Manager")
+    n = len([u for u in manager.ListUnits() if u[3] == "failed"])
+    eo_units.labels(ctn, "failed").set(n)
+
+
+def run_in_machines(ctn):
+    """Run this exporter in every local machine and merge the metrics it prints."""
+    # Index the gauges by exported name: it can differ from the variable name
+    # (eo_units is exported as eo_failed_units), so globals()[name] is not reliable.
+    gauges = {}
+    for obj in list(globals().values()):
+        if isinstance(obj, Gauge):
+            for family in obj.describe():
+                gauges[family.name] = obj
+    for machine in login.machine_names():
+        name = shlex.quote(machine)
+        r = run("systemd-run --wait --pipe -q --machine %s /usr/bin/prometheus-entrouvert-exporter.py --ctn %s"
+                % (name, name))
+        for m in text_string_to_metric_families(r):
+            metric = gauges.get(m.name)
+            if metric is None:
+                # metric unknown to this version of the exporter: count it, keep going
+                eo_errors.labels(ctn=ctn).inc()
+                continue
+            for s in m.samples:
+                metric.labels(**s.labels).set(s.value)
+
+
+def main():
+    """Parse the command line, run every check and print the registry."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--test", action="store_true", help="raise errors")
+    parser.add_argument("--ctn", default="")
+    args = parser.parse_args()
+
+    for test in [certificates, debian, etckeeper, exim, journald, local_changes, mailboxes, munin,
+                 nginx, packages, postgresql, rabbitmq, sessions, threads, units, run_in_machines]:
+        try:
+            test(args.ctn)
+        except Exception:
+            # a failing check must not prevent the other metrics from being exported
+            eo_errors.labels(ctn=args.ctn).inc()
+            if args.test:
+                raise
+    print(generate_latest(registry).decode())
+
+
+if __name__ == "__main__":
+    main()