352 lines
12 KiB
Python
Executable File
352 lines
12 KiB
Python
Executable File
#!/usr/bin/python3
|
|
import argparse
|
|
import datetime
|
|
import glob
|
|
import json
|
|
import mailbox
|
|
import os
|
|
import shlex
|
|
import subprocess
|
|
import time
|
|
|
|
import apt_pkg
|
|
import dbus
|
|
import psutil
|
|
import requests
|
|
from cryptography import x509
|
|
from cryptography.hazmat.backends import default_backend
|
|
from git import Repo
|
|
from prometheus_client import CollectorRegistry, Gauge
|
|
from prometheus_client.exposition import generate_latest
|
|
from prometheus_client.parser import text_string_to_metric_families
|
|
from systemd import journal, login
|
|
|
|
# Message substrings that must not be counted as journald errors, keyed by
# the systemd unit (the entry's _SYSTEMD_UNIT field) emitting them.  The list
# stored under the empty key is applied to every entry, whatever its unit.
JOURNALD_IGNORED_ERRORS = {
    'dovecot.service': [
        'Connection lost to LDAP server, reconnecting',
    ],
    'ssh.service': [
        'maximum authentication attempts exceeded for ',
        'error: kex_exchange_identification: read: Connection reset by peer',
        'error: kex_exchange_identification: Connection closed by remote host',
        'error: kex_exchange_identification: banner line contains invalid characters',
        'fatal: userauth_pubkey: parse publickey packet: incomplete message [preauth]',
        'fatal: userauth_pubkey: parse request failed: incomplete message [preauth]',
        'fatal: Timeout before authentication for ',
    ],
    '': [  # match all services (useful for ovpn*)
        'Connection reset, restarting [0]',
    ],
}
|
|
|
|
|
|
# Dedicated registry: only the gauges declared below are rendered when the
# script prints its metrics.
registry = CollectorRegistry()
# Every gauge carries a 'ctn' label, filled with the value of the --ctn
# command line option (empty by default).
eo_errors = Gauge('eo_errors', 'failed tests', ['ctn'], registry=registry)
eo_certificates = Gauge('eo_certificates', 'certificates', ['ctn', 'name'], registry=registry)
eo_debian = Gauge('eo_debian', 'debian os', ['ctn'], registry=registry)
eo_etckeeper = Gauge('eo_etckeeper', 'etckeeper', ['ctn', 'name'], registry=registry)
eo_exim = Gauge('eo_exim', 'exim', ['ctn', 'name'], registry=registry)
eo_harakiri = Gauge('eo_harakiri', 'harakiri', ['ctn', 'name', 'unit'], registry=registry)
eo_journal = Gauge('eo_journal', 'journald', ['ctn', 'name'], registry=registry)
eo_local_changes = Gauge('eo_local_changes', '', ['ctn'], registry=registry)
eo_mailboxes = Gauge('eo_mailboxes', 'emails in local mailboxes', ['ctn', 'name'], registry=registry)
eo_munin = Gauge('eo_munin', 'munin', ['ctn', 'name'], registry=registry)
eo_nginx = Gauge('eo_nginx', 'nginx', ['ctn', 'name'], registry=registry)
# 'context' holds the replication slot name for per-slot samples and is left
# empty for cluster-wide ones.
eo_postgresql = Gauge(
    'eo_postgresql', 'postgresql', ['ctn', 'version', 'cluster', 'context', 'name'], registry=registry
)
eo_rabbitmq = Gauge('eo_rabbitmq', 'rabbitmq', ['ctn'], registry=registry)
eo_threads = Gauge('eo_threads', 'system threads', ['ctn'], registry=registry)
eo_units = Gauge('eo_units', 'systemd units', ['ctn', 'name', 'state'], registry=registry)
eo_packages = Gauge('eo_packages', 'packages', ['ctn', 'state'], registry=registry)
eo_kernel = Gauge('eo_kernel', 'kernel update', ['ctn'], registry=registry)
|
|
|
|
|
|
def run(cmd):
    """Run a command and return its standard output as text.

    ``cmd`` is a command line string; it is split with ``shlex.split`` and
    executed directly, without a shell.  The exit status is ignored and the
    command's standard error is discarded.

    Returns the output lines joined with newlines, each one stripped of its
    trailing whitespace (so the result never ends with a newline).
    """
    args = shlex.split(cmd)
    # subprocess.run() waits for the child and drains its output; reading only
    # stdout of a Popen with a piped stderr would leave a zombie behind and
    # could deadlock as soon as the child fills the unread stderr pipe.
    completed = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    lines = completed.stdout.split(b'\n')
    if lines[-1] == b'':
        # the output ended with a newline (or was empty): drop the empty tail
        lines.pop()
    return '\n'.join(line.decode().rstrip() for line in lines)
|
|
|
|
|
|
def debian(ctn):
    """Publish the Debian version read from /etc/debian_version.

    The file content is converted to a float, except when it contains 'sid',
    in which case 0 is published.
    """
    with open('/etc/debian_version') as version_file:
        raw_version = version_file.read()
    # we shouldn't have non-stable versions: they are reported as 0
    release = 0 if 'sid' in raw_version else float(raw_version.strip())
    eo_debian.labels(ctn).set(release)
|
|
|
|
|
|
def etckeeper(ctn):
    """Report the state of the git repository located in /etc.

    Two samples are published: 'dirty' (1 when the repository is dirty, 0
    otherwise) and 'untracked' (the number of untracked files).
    """
    etc_repo = Repo('/etc')

    dirty_sample = eo_etckeeper.labels(ctn, 'dirty')
    dirty_sample.set(int(etc_repo.is_dirty()))

    untracked_sample = eo_etckeeper.labels(ctn, 'untracked')
    untracked_sample.set(len(etc_repo.untracked_files))
|
|
|
|
|
|
def exim(ctn):
    """Publish exim log error lines and the exim queue count.

    Does nothing when /usr/sbin/exim is absent.  Otherwise the 'errors' sample
    is incremented once per line of /var/log/exim4/mainlog containing ' ** ',
    lines mentioning both benjamin.dauvergne+eo@gmail.com and
    support.google.com excepted, and the 'deferred' sample is set to the
    integer printed by ``exim -bpc``.
    """
    if not os.path.exists('/usr/sbin/exim'):
        return

    def is_counted(log_line):
        # only lines flagged with ' ** ' are of interest
        if b' ** ' not in log_line:
            return False
        excluded = b'benjamin.dauvergne+eo@gmail.com' in log_line and b'support.google.com' in log_line
        return not excluded

    # the log is read as bytes, so no decoding error can occur
    with open('/var/log/exim4/mainlog', 'rb') as mainlog:
        for log_line in mainlog:
            if is_counted(log_line):
                eo_exim.labels(ctn, 'errors').inc()

    queue_count = int(run('exim -bpc'))
    eo_exim.labels(ctn, 'deferred').set(queue_count)
|
|
|
|
|
|
def certificates(ctn):
    """Publish the number of days left before each known certificate expires.

    Certificate files that do not exist on this host are skipped.  Each sample
    is labelled with the certificate path; the value becomes negative once the
    certificate has expired.
    """
    certs = ['/etc/exim4/exim.crt']
    # not_valid_after is a naive datetime expressed in UTC, so it has to be
    # compared with a naive UTC "now", not with the local time.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    for cert in certs:
        if not os.path.isfile(cert):
            continue
        # read the PEM data as bytes and close the file right away
        with open(cert, 'rb') as fh:
            c = x509.load_pem_x509_certificate(fh.read(), default_backend())
        remaining = (c.not_valid_after - now).days
        eo_certificates.labels(ctn, cert).set(remaining)
|
|
|
|
|
|
def harakiri(ctn):
    """Count recent journal entries containing 'HARAKIRI ON WORKER'.

    The journal is read from 15 minutes ago; each matching entry increments
    the 'errors' sample labelled with the systemd unit of the entry (an empty
    unit label is used when the entry has no _SYSTEMD_UNIT field).
    """
    j = journal.Reader()
    fifteen = time.time() - 15 * 60
    j.seek_realtime(fifteen)
    for e in j:
        # entries are not guaranteed to carry a MESSAGE field
        msg = e.get('MESSAGE', '')
        if isinstance(msg, bytes):
            # values the reader could not decode are handed over as raw bytes
            msg = msg.decode(errors='replace')
        if 'HARAKIRI ON WORKER' in msg:
            eo_harakiri.labels(ctn, 'errors', e.get('_SYSTEMD_UNIT', '')).inc()
|
|
|
|
|
|
def journald(ctn):
    """Count the critical and error journal entries of the last 15 minutes.

    Only entries of priority 2 ('critical') and 3 ('error') are read.  An
    entry is counted under its priority tag unless its message contains one of
    the substrings listed in JOURNALD_IGNORED_ERRORS for its unit or for all
    units.  Messages containing "Connected -> NetworkFailure" or "task nfsd"
    additionally increment the 'network_failure' sample.

    Both 'critical' and 'error' samples are always published, starting at 0.
    """
    j = journal.Reader()
    fifteen = time.time() - 15 * 60
    j.seek_realtime(fifteen)
    # matches on the same field are alternatives: priority 3 or priority 2
    j.add_match(PRIORITY=3)
    j.add_match(PRIORITY=2)
    eo_journal.labels(ctn, "critical").set(0)
    eo_journal.labels(ctn, "error").set(0)
    priority_to_tag = {2: "critical", 3: "error"}
    for e in j:
        # entries are not guaranteed to carry a MESSAGE field
        msg = e.get("MESSAGE", "")
        if isinstance(msg, bytes):
            # values the reader could not decode are handed over as raw bytes
            msg = msg.decode(errors='replace')
        ignored_strings = (
            JOURNALD_IGNORED_ERRORS.get(e.get('_SYSTEMD_UNIT'), []) + JOURNALD_IGNORED_ERRORS['']
        )
        if not any(ignored_string in msg for ignored_string in ignored_strings):
            eo_journal.labels(ctn, priority_to_tag[e["PRIORITY"]]).inc()
        # NOTE(review): this check is applied to every entry, ignored or not;
        # confirm that it is not meant to be limited to non-ignored entries.
        if "Connected -> NetworkFailure" in msg or "task nfsd" in msg:
            eo_journal.labels(ctn, "network_failure").inc()
|
|
|
|
|
|
def local_changes(ctn):
    """Publish the number of lines of /var/log/check-local-changes.log.

    Nothing is published when the file does not exist.
    """
    f = '/var/log/check-local-changes.log'
    if not os.path.isfile(f):
        return
    # count lines while streaming, and make sure the file gets closed
    with open(f) as fh:
        n = sum(1 for _ in fh)
    eo_local_changes.labels(ctn).set(n)
|
|
|
|
|
|
def munin(ctn):
    """Count the 'exited with status' lines logged by munin-node in the last hour.

    Both the current and the previously rotated munin-node logs are scanned
    when they exist.  Lines mentioning the containers_cpu, containers_io or
    containers_memory graphs are not counted.  The total is published as the
    'errors' sample, 0 included.
    """
    since = datetime.datetime.now() - datetime.timedelta(hours=1)
    since_str = since.strftime('%Y/%m/%d-%H:%M:%S')
    # do not report for known broken graphs
    known_broken = ('containers_cpu', 'containers_io', 'containers_memory')
    count = 0
    for filename in ['/var/log/munin/munin-node.log', '/var/log/munin/munin-node.log.1']:
        if not os.path.isfile(filename):
            continue
        # stream the log and make sure it gets closed
        with open(filename) as fh:
            for line in fh:
                # plain string comparison: presumably every line starts with a
                # timestamp formatted like since_str -- TODO confirm
                if line <= since_str or 'exited with status' not in line:
                    continue
                if any(graph in line for graph in known_broken):
                    continue
                count += 1
    eo_munin.labels(ctn, 'errors').set(count)
|
|
|
|
|
|
def nginx(ctn):
    """Publish the number of active nginx connections.

    The value is read from the 'Active connections' line of
    http://localhost/nginx_status.  Nothing is published when the connection
    cannot be established or when the response is not successful.  A server
    that accepts the connection but does not answer within the timeout raises
    a timeout exception to the caller instead of blocking forever.
    """
    try:
        # without a timeout a stuck server would hang the whole exporter
        r = requests.get('http://localhost/nginx_status', timeout=10)
    except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
        return
    if not r.ok:
        return
    for line in r.text.splitlines():
        if 'Active connections' in line:
            # the value follows the first colon of the line
            n = int(line.split(':')[1].strip())
            eo_nginx.labels(ctn, 'connections').set(n)
|
|
|
|
|
|
def packages(ctn):
    """Publish the number of installed packages that can be upgraded.

    A package is counted when it has a current version, is not marked for
    installation and is reported upgradable by the apt dependency cache.
    """
    apt_pkg.init()
    apt_cache = apt_pkg.Cache(None)
    dependencies = apt_pkg.DepCache(apt_cache)
    dependencies.read_pinfile()
    dependencies.init()

    def is_pending_upgrade(package):
        if package.current_ver is None:
            return False
        if dependencies.marked_install(package):
            return False
        return dependencies.is_upgradable(package)

    upgradable_count = 0
    for package in apt_cache.packages:
        if is_pending_upgrade(package):
            upgradable_count += 1
    eo_packages.labels(ctn, 'upgradable').set(upgradable_count)
|
|
|
|
|
|
def mailboxes(ctn):
    """Publish the number of messages of each recently modified local mailbox.

    Files of /var/spool/mail modified within the last 30 days are opened as
    mbox mailboxes; the sample is labelled with the file name.
    """
    # skip servers where dovecot is installed as it's expected to have
    # mailboxes there.
    if os.path.exists('/etc/dovecot/dovecot.conf'):
        return

    candidates = glob.glob('/var/spool/mail/*')
    oldest_accepted_mtime = time.time() - 30 * 86400
    for path in candidates:
        if not os.path.isfile(path):
            continue
        # skip mailboxes that didn't change for a long time
        if os.stat(path).st_mtime <= oldest_accepted_mtime:
            continue
        box_name = path.split('/')[-1]
        message_count = len(mailbox.mbox(path))
        eo_mailboxes.labels(ctn, box_name).set(message_count)
|
|
|
|
|
|
def postgresql(ctn):
    """Publish health metrics for every PostgreSQL cluster of this host.

    Does nothing when /usr/bin/pg_lsclusters is missing.  For each cluster
    listed by ``pg_lsclusters --json`` the following samples are published,
    identified by their 'name' label:

    * 'running': the ``running`` field of the cluster;
    * 'archive_failed': ``failed_count`` read from ``pg_stat_archiver``;
    * 'replicating', when the cluster's ``recovery`` field is set: the number
      of ``pg_stat_wal_receiver`` rows whose status is 'streaming';
    * 'slot-active' and 'slot-delta' per active replication slot (the slot
      name goes in the 'context' label) and 'active-slot-count', otherwise.

    SQL queries are run with ``sudo -u postgres psql`` on the cluster's port.
    """
    if not os.path.exists('/usr/bin/pg_lsclusters'):
        return

    clusters = json.loads(run('pg_lsclusters --json'))
    for cluster in clusters:
        version = cluster['version']
        name = cluster['cluster']
        eo_postgresql.labels(ctn, version, name, '', 'running').set(cluster['running'])
        # NOTE(review): every SQL check below is assumed to apply to running
        # clusters only -- confirm the nesting against the original file.
        if cluster['running']:
            # check the archiver status
            archiver_failures = run(
                "sudo -u postgres psql -p %s -tAc 'select failed_count from pg_stat_archiver;'"
                % cluster['port']
            )
            # int() raises ValueError when psql printed nothing usable
            eo_postgresql.labels(ctn, version, name, '', 'archive_failed').set(int(archiver_failures))

            if 'recovery' in cluster and cluster['recovery']:
                # we are on a standby, check it's connected to a master
                receiver_statuses = run(
                    "sudo -u postgres psql -p %s -tAc 'select status from pg_stat_wal_receiver ;'"
                    % cluster['port']
                )
                # start from 0 so that the sample exists even without a receiver
                eo_postgresql.labels(ctn, version, name, '', 'replicating').set(0)
                for status in receiver_statuses.splitlines():
                    if status == 'streaming':
                        eo_postgresql.labels(ctn, version, name, '', 'replicating').inc()
            else:
                # we are on a primary... check the slots are good
                slots = run(
                    "sudo -u postgres psql -p %s -tAc 'select active, slot_name, pg_wal_lsn_diff(pg_current_wal_lsn() , restart_lsn) from pg_replication_slots where active;'"
                    % cluster['port']
                )
                active_slot_count = 0
                for slot in slots.splitlines():
                    # psql -tA output: one row per line, columns separated by '|'
                    active, slot_name, delta = slot.split('|')
                    eo_postgresql.labels(ctn, version, name, slot_name, 'slot-active').set(active == 't')
                    eo_postgresql.labels(ctn, version, name, slot_name, 'slot-delta').set(int(delta))
                    if active == 't':
                        active_slot_count += 1
                eo_postgresql.labels(ctn, version, name, '', 'active-slot-count').set(active_slot_count)
|
|
|
|
|
|
def rabbitmq(ctn):
    """Publish the total number of messages reported by rabbitmqctl.

    Does nothing when /usr/sbin/rabbitmqctl is not installed.  The output of
    ``rabbitmqctl list_queues messages`` is read line by line; every line made
    only of digits is added to the gauge, other lines are ignored.
    """
    rabbitmqctl = '/usr/sbin/rabbitmqctl'
    if not os.path.isfile(rabbitmqctl):
        return
    output = run('%s list_queues messages' % rabbitmqctl)
    # run() returns a single string: it must be split into lines, iterating
    # over it directly would add up individual digits instead of queue sizes
    for line in output.splitlines():
        value = line.strip()
        if value.isdigit():
            eo_rabbitmq.labels(ctn).inc(int(value))
|
|
|
|
|
|
def threads(ctn):
    """Publish the total number of threads of all processes."""
    total = 0
    for p in psutil.process_iter():
        try:
            total += p.num_threads()
        except psutil.NoSuchProcess:
            # the process exited while the process list was being walked
            continue
    eo_threads.labels(ctn).set(total)
|
|
|
|
|
|
def units(ctn):
    """Publish the number of failed systemd units.

    Units are listed through the systemd manager D-Bus interface; those whose
    fourth field is 'failed' are counted, units whose name starts with 'user@'
    excepted.  When at least one unit failed, the list of failed unit names is
    passed as the 'name' label, otherwise the label is empty.
    """
    system_bus = dbus.SystemBus()
    systemd_object = system_bus.get_object('org.freedesktop.systemd1', '/org/freedesktop/systemd1')
    manager = dbus.Interface(systemd_object, 'org.freedesktop.systemd1.Manager')

    failed_names = []
    for unit in manager.ListUnits():
        if unit[3] != 'failed' or unit[0].startswith('user@'):
            continue
        failed_names.append(unit[0].replace('dbus.String', ''))

    failed_count = len(failed_names)
    name_label = failed_names if failed_names else ''
    eo_units.labels(ctn, name_label, 'failed').set(failed_count)
|
|
|
|
|
|
def run_in_machines(ctn):
    """Run the exporter inside every machine and merge its metrics.

    For each machine name returned by ``login.machine_names()`` the exporter
    is started with ``systemd-run --machine``, using the machine name as
    ``--ctn`` value.  Its output is parsed as Prometheus text and every sample
    is copied to the module-level gauge named after the metric family.

    The ``ctn`` argument is not used.
    """
    for machine_name in login.machine_names():
        exporter_output = run(
            'systemd-run --wait --pipe -q --machine %s /usr/bin/prometheus-system-exporter.py --ctn %s'
            % (machine_name, machine_name)
        )
        for family in text_string_to_metric_families(exporter_output):
            for sample in family.samples:
                # gauges are module-level objects named after their metric
                gauge = globals()[family.name]
                gauge.labels(**sample.labels).set(sample.value)
|
|
|
|
|
|
def check_kernel_version(ctn):
    """Tell whether the running kernel is the one currently installed.

    Nothing is published when /vmlinuz does not exist.  Otherwise the gauge is
    set to:

    * 2 when /vmlinuz points to another release than the running one;
    * 1 when the version of the running kernel differs from the version of
      the installed ``linux-image-<release>`` package;
    * 0 otherwise.

    Raises ``subprocess.CalledProcessError`` when ``dpkg --status`` fails.
    """
    # no check if no kernel installed (aka container)
    if not os.path.exists('/vmlinuz'):
        return

    uname = os.uname()
    current_version = uname.version
    current_release = uname.release

    # first check, simple : verify that the current release match /vmlinuz
    next_boot_release = os.path.basename(os.readlink('/vmlinuz'))
    if next_boot_release != 'vmlinuz-%s' % current_release:
        eo_kernel.labels(ctn).set(2)
        return

    # second check, a bit harder : verify that the current version matches the package
    # The version string looks like "#1 SMP Debian 5.10.197-1 (2023-09-29)",
    # but extra flags may precede "Debian" ("#1 SMP PREEMPT_DYNAMIC Debian
    # 6.1.55-1 (...)"), so take the word following "Debian" when it is there
    # and only fall back on the fixed position otherwise.
    fields = current_version.split(' ')
    if 'Debian' in fields[:-1]:
        current_version_extract = fields[fields.index('Debian') + 1]
    else:
        current_version_extract = fields[3]
    expected_line = b'Version: ' + current_version_extract.encode('ascii')
    for line in subprocess.check_output(['dpkg', '--status', 'linux-image-' + current_release]).split(b'\n'):
        if line.startswith(b'Version: '):
            if line != expected_line:
                eo_kernel.labels(ctn).set(1)
                return
            break
    eo_kernel.labels(ctn).set(0)
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: run every check, then print the collected metrics.
    parser = argparse.ArgumentParser()
    parser.add_argument('--test', action='store_true', help='raise errors')
    parser.add_argument('--ctn', default='')
    args = parser.parse_args()

    checks = (
        certificates,
        check_kernel_version,
        debian,
        etckeeper,
        exim,
        harakiri,
        journald,
        local_changes,
        mailboxes,
        munin,
        nginx,
        packages,
        postgresql,
        rabbitmq,
        threads,
        units,
        run_in_machines,
    )
    for check in checks:
        try:
            check(args.ctn)
        except Exception:
            # a failing check is only counted, unless --test asks to see the error
            eo_errors.labels(ctn=args.ctn).inc()
            if args.test:
                raise

    rendered_metrics = generate_latest(registry)
    print(rendered_metrics.decode())
|