publik-infra/prometheus-entrouvert-exporter/prometheus-entrouvert-expor...

207 lines
6.9 KiB
Python
Executable File

#!/usr/bin/python3
import apt
import argparse
from cryptography import x509
from cryptography.hazmat.backends import default_backend
import datetime
import dbus
import mailbox
from git import Repo
import glob
import os
import psutil
import requests
import shlex
import subprocess
from systemd import login, journal
import time
from prometheus_client import CollectorRegistry, Gauge
from prometheus_client.exposition import generate_latest
from prometheus_client.parser import text_string_to_metric_families
# Package cache shared by the collectors that query apt.
apt_cache = apt.Cache()
# Dedicated registry: every gauge below is attached to it.
registry = CollectorRegistry()


def _gauge(name, documentation, labelnames):
    """Create a Gauge registered on the exporter's own registry."""
    return Gauge(name, documentation, labelnames, registry=registry)


# Every gauge carries a "ctn" label; most add a "name" (or "state") label
# to distinguish the individual measurements of one collector.
eo_errors = _gauge("eo_errors", "failed tests", ["ctn"])
eo_certificates = _gauge("eo_certificates", "certificates", ["ctn", "name"])
eo_debian = _gauge("eo_debian", "debian os", ["ctn"])
eo_etckeeper = _gauge("eo_etckeeper", "etckeeper", ["ctn", "name"])
eo_exim = _gauge("eo_exim", "exim", ["ctn", "name"])
eo_journal = _gauge("eo_journal", "journald", ["ctn", "name"])
eo_local_changes = _gauge("eo_local_changes", "", ["ctn"])
eo_mailboxes = _gauge("eo_mailboxes", "emails in local mailboxes", ["ctn", "name"])
eo_munin = _gauge("eo_munin", "munin", ["ctn", "name"])
eo_nginx = _gauge("eo_nginx", "nginx", ["ctn", "name"])
eo_postgresql = _gauge("eo_postgresql", "postgresql", ["ctn", "role", "name"])
eo_rabbitmq = _gauge("eo_rabbitmq", "rabbitmq", ["ctn"])
eo_threads = _gauge("eo_threads", "system threads", ["ctn"])
# Note: the variable is eo_units but the exported metric is eo_failed_units.
eo_units = _gauge("eo_failed_units", "systemd failed units", ["ctn", "state"])
eo_packages = _gauge("eo_packages", "packages", ["ctn", "state"])
eo_sessions = _gauge("eo_sessions", "sessions", ["ctn"])
def run(cmd):
    """Run *cmd* and return its standard output as text.

    The command string is split with shlex and executed without a shell.
    Standard error is captured and discarded. Every output line is decoded
    and stripped of trailing whitespace; the lines are joined with newlines.
    """
    argv = shlex.split(cmd)
    proc = subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for the child. Reading only
    # stdout would block forever once the child fills the stderr pipe, and
    # would leave the finished process unreaped.
    stdout, _ = proc.communicate()
    lines = stdout.decode().split("\n")
    # A trailing newline produces one empty final element; drop it so the
    # result has no spurious trailing line.
    if lines and not lines[-1]:
        lines.pop()
    return "\n".join(line.rstrip() for line in lines)
def debian(ctn):
    """Export the content of /etc/debian_version, parsed as a float."""
    with open('/etc/debian_version') as version_file:
        raw_version = version_file.read()
    release = float(raw_version.strip())
    eo_debian.labels(ctn).set(release)
def etckeeper(ctn):
    """Export the state of the git repository in /etc.

    "dirty" is 1 when the repository reports itself dirty, else 0;
    "untracked" is the number of untracked files.
    """
    etc_repo = Repo("/etc")
    dirty_flag = int(etc_repo.is_dirty())
    eo_etckeeper.labels(ctn, "dirty").set(dirty_flag)
    untracked_count = len(etc_repo.untracked_files)
    eo_etckeeper.labels(ctn, "untracked").set(untracked_count)
def exim(ctn):
    """Export exim log errors and the size of the mail queue.

    "errors" is the number of lines of /var/log/exim4/mainlog containing
    "**"; "deferred" is the integer printed by ``exim -bpc``.
    """
    # The log may contain bytes that are not valid in the default encoding;
    # replace them instead of letting a decode error abort the collector.
    with open('/var/log/exim4/mainlog', errors="replace") as fh:
        # Iterate lazily rather than loading the whole log in memory.
        failures = sum(1 for line in fh if "**" in line)
    # Always publish the series, even when the count is zero.
    eo_exim.labels(ctn, "errors").set(failures)
    deferred = int(run("exim -bpc"))
    eo_exim.labels(ctn, "deferred").set(deferred)
def certificates(ctn):
    """Export the number of whole days left before each certificate expires.

    Certificate files that do not exist on this host are skipped.
    """
    certs = ["/etc/exim4/exim.crt"]
    # not_valid_after is a naive datetime expressed in UTC, so it must be
    # compared with a naive UTC "now"; using local time would skew the
    # result by the host's UTC offset.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    for cert in certs:
        if not os.path.isfile(cert):
            continue
        # Read the PEM data as bytes and close the handle deterministically.
        with open(cert, "rb") as fh:
            c = x509.load_pem_x509_certificate(fh.read(), default_backend())
        remaining = (c.not_valid_after - now).days
        eo_certificates.labels(ctn, cert).set(remaining)
def journald(ctn):
    """Export counts of recent high-priority journal entries.

    Scans the journal from fifteen minutes ago and feeds the "critical",
    "error" and "network_failure" series of eo_journal.
    """
    j = journal.Reader()
    fifteen = time.time() - 15 * 60
    j.seek_realtime(fifteen)
    j.add_match(PRIORITY=2)
    eo_journal.labels(ctn, "critical").set(len(list(j)))
    j.seek_realtime(fifteen)
    # NOTE(review): the PRIORITY=2 match is still installed here, and the
    # journal API ORs matches on the same field, so this second pass
    # presumably yields priority 2 *and* 3 entries. Left unchanged because
    # existing alerts may depend on it -- confirm the intent.
    j.add_match(PRIORITY=3)
    for e in j:
        eo_journal.labels(ctn, "error").inc()
        # An entry may lack MESSAGE, or carry it as bytes when it could not
        # be decoded; neither case should abort the whole collector.
        msg = e.get("MESSAGE", "")
        if isinstance(msg, bytes):
            msg = msg.decode(errors="replace")
        if "Connected -> NetworkFailure" in msg or "task nfsd" in msg:
            eo_journal.labels(ctn, "network_failure").inc()
def local_changes(ctn):
    """Export the number of lines in /var/log/check-local-changes.log.

    Nothing is exported when the log file does not exist.
    """
    f = "/var/log/check-local-changes.log"
    if os.path.isfile(f):
        # Close the handle deterministically and count lines lazily instead
        # of loading the whole file.
        with open(f) as fh:
            n = sum(1 for _ in fh)
        eo_local_changes.labels(ctn).set(n)
def munin(ctn):
    """Export the number of error lines in the munin-node log.

    A line counts when it contains "rror" and does not contain
    "mail_space". Nothing is exported when the log file does not exist.
    """
    f = "/var/log/munin/munin-node.log"
    if os.path.isfile(f):
        # Close the handle deterministically and scan the file lazily.
        with open(f) as fh:
            n = sum(1 for line in fh if "rror" in line and "mail_space" not in line)
        eo_munin.labels(ctn, "errors").set(n)
def nginx(ctn):
    """Export the active connection count from the local nginx status page.

    Fetches http://localhost/nginx_status and publishes the number found on
    the "Active connections" line. Nothing is exported when the response is
    not OK or the line is absent.
    """
    # A timeout keeps an unresponsive nginx from hanging the whole exporter.
    r = requests.get("http://localhost/nginx_status", timeout=10)
    if r.ok:
        # Iterating r.text directly yields single characters, so the page
        # must be split into lines for the substring test to ever match.
        for line in r.text.splitlines():
            if "Active connections" in line:
                n = int(line.split(':')[1].strip())
                eo_nginx.labels(ctn, "connections").set(n)
                break
def packages(ctn):
    """Export the number of upgradable packages."""
    # get_changes() only lists packages that have been marked for a change,
    # which is none on a freshly opened cache, so walk the cache itself.
    # is_upgradable is the current python-apt spelling of the attribute.
    n = sum(1 for pkg in apt_cache if pkg.is_upgradable)
    eo_packages.labels(ctn, "upgradable").set(n)
def mailboxes(ctn):
    """Export the message count of every mbox found in /var/spool/mail.

    Each series is labelled with the mailbox file name.
    """
    for path in glob.glob("/var/spool/mail/*"):
        box_name = path.split("/")[-1]
        message_count = len(mailbox.mbox(path))
        eo_mailboxes.labels(ctn, box_name).set(message_count)
def postgresql(ctn):
    """Export PostgreSQL replication process counts.

    Without any recovery.conf under /var/lib/postgresql the host is labelled
    "primary" and the "replicators" series counts processes whose first
    command-line element contains "walsender". Otherwise it is labelled
    "secondary" and "replicating" counts "walreceiver" processes.
    """
    # NOTE(review): this tests that apt knows the package name, which may
    # not imply that it is installed -- confirm against python-apt.
    if "postgresql" not in apt_cache:
        return
    if glob.glob("/var/lib/postgresql/*/*/recovery.conf"):
        role, name, marker = "secondary", "replicating", "walreceiver"
    else:
        role, name, marker = "primary", "replicators", "walsender"
    gauge = eo_postgresql.labels(ctn, role, name)
    # Publish the series even when no matching process is found.
    gauge.set(0)
    for p in psutil.process_iter():
        try:
            cmd = p.cmdline()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process exited (or became unreadable) during the scan;
            # it must not abort the whole collector.
            continue
        if cmd and marker in cmd[0]:
            gauge.inc()
def rabbitmq(ctn):
    """Export the total number of messages across RabbitMQ queues.

    Sums the purely numeric lines printed by ``rabbitmqctl list_queues
    messages``. Nothing is exported when rabbitmqctl is not installed.
    """
    rabbitmqctl = "/usr/sbin/rabbitmqctl"
    if os.path.isfile(rabbitmqctl):
        output = run("%s list_queues messages" % rabbitmqctl)
        # run() returns a single string: it has to be split into lines,
        # otherwise each digit is added separately ("12" would count as 3).
        counts = (line.strip() for line in output.splitlines())
        total = sum(int(count) for count in counts if count.isdigit())
        eo_rabbitmq.labels(ctn).set(total)
def sessions(ctn):
    """Export the number of entries returned by psutil.users()."""
    logged_in = psutil.users()
    eo_sessions.labels(ctn).set(len(logged_in))
def threads(ctn):
    """Export the total number of threads over all visible processes."""
    total = 0
    for p in psutil.process_iter():
        try:
            total += p.num_threads()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process exited (or became unreadable) during the scan;
            # skip it rather than abort with a partial count.
            continue
    eo_threads.labels(ctn).set(total)
def units(ctn):
    """Export the number of systemd units reported as "failed".

    The unit list is obtained from the systemd manager over the system
    D-Bus.
    """
    system_bus = dbus.SystemBus()
    systemd_obj = system_bus.get_object("org.freedesktop.systemd1", "/org/freedesktop/systemd1")
    manager = dbus.Interface(systemd_obj, "org.freedesktop.systemd1.Manager")
    failed = 0
    for unit in manager.ListUnits():
        # presumably index 3 is the unit's active state -- confirm against
        # the systemd D-Bus API.
        if unit[3] == "failed":
            failed += 1
    eo_units.labels(ctn, "failed").set(failed)
def run_in_machines(ctn):
    """Run the exporter inside every machine and merge its metrics here.

    For each name returned by login.machine_names(), the exporter is run in
    that machine through systemd-run with ``--ctn <machine>``; its text
    output is parsed and every sample is copied into the local gauge of the
    same exported name.

    *ctn* is not used; it is accepted so that this function can be called
    like the other collectors.
    """
    # Index the gauges by exported metric name. The Python variable name
    # does not always match it (eo_units is exported as eo_failed_units), so
    # looking the parsed name up in globals() directly raised KeyError.
    gauges = {
        obj.describe()[0].name: obj
        for obj in list(globals().values())
        if isinstance(obj, Gauge)
    }
    for machine in login.machine_names():
        r = run("systemd-run --wait --pipe -q --machine %s /usr/bin/prometheus-entrouvert-exporter.py --ctn %s"
                % (machine, machine))
        for m in text_string_to_metric_families(r):
            metric = gauges.get(m.name)
            if metric is None:
                # Metric not declared in this copy of the exporter: skip it.
                continue
            for s in m.samples:
                metric.labels(**s.labels).set(s.value)
if __name__ == "__main__":
    # Command-line entry point: run every collector once, then print the
    # registry in the Prometheus text format.
    cli = argparse.ArgumentParser()
    cli.add_argument("--test", action="store_true", help="raise errors")
    cli.add_argument("--ctn", default="")
    args = cli.parse_args()

    collectors = [
        certificates,
        debian,
        etckeeper,
        exim,
        journald,
        local_changes,
        mailboxes,
        munin,
        nginx,
        packages,
        postgresql,
        rabbitmq,
        sessions,
        threads,
        units,
        run_in_machines,
    ]
    for collector in collectors:
        try:
            collector(args.ctn)
        except Exception:
            # A failing collector is counted in eo_errors and must not stop
            # the others; with --test the exception is re-raised.
            eo_errors.labels(ctn=args.ctn).inc()
            if args.test:
                raise
    print(generate_latest(registry).decode())