Add free disk space check to health-checker.py.

Move load-average check to health-checker.py.
This commit is contained in:
Ivaylo Novakov 2020-09-07 17:56:47 +02:00
parent ff56990faa
commit 0838e4f5e5
No known key found for this signature in database
GPG Key ID: 06B9354AB08BE9C6
2 changed files with 51 additions and 16 deletions

View File

@ -4,6 +4,7 @@ import asyncio
import io import io
import json import json
import os import os
import re
import sys import sys
import traceback import traceback
from datetime import datetime, timedelta from datetime import datetime, timedelta
@ -26,6 +27,12 @@ if len(sys.argv) > 3:
# a lower limit in order to leave some space for additional message text. # a lower limit in order to leave some space for additional message text.
DISCORD_MAX_MESSAGE_LENGTH = 1900 DISCORD_MAX_MESSAGE_LENGTH = 1900
# Number of KiB in one GiB. Disk sizes in this script are handled in KiB
# (df is invoked with --block-size=1024), so dividing such a value by GB
# yields GiB.
GB = 2 ** 20
# Minimum amount of free space, in KiB, a server is expected to have. A
# Discord warning is issued once the free space drops below this value.
FREE_DISK_SPACE_THRESHOLD = GB * 50  # 50 GiB
bot_token = setup() bot_token = setup()
client = discord.Client() client = discord.Client()
@ -45,6 +52,8 @@ async def on_ready():
async def run_checks(): async def run_checks():
print("Running Skynet portal health checks") print("Running Skynet portal health checks")
try: try:
await check_load_average()
await check_disk()
await check_health() await check_health()
except: except:
trace = traceback.format_exc() trace = traceback.format_exc()
@ -57,6 +66,47 @@ async def run_checks():
force_notify=True) force_notify=True)
# check_load_average monitors the system's load average value and issues a
# warning message if it exceeds 10. If the output of `uptime` cannot be
# parsed, a message describing the problem is sent instead.
async def check_load_average():
    uptime_string = os.popen("uptime").read().strip()
    # sys.platform is lowercase ("darwin" on macOS); comparing against
    # "Darwin" never matches, which made the macOS pattern unreachable.
    if sys.platform == "darwin":
        # macOS separates the values with spaces and says "averages".
        pattern = r"^.*load averages: \d*\.\d* \d*\.\d* (\d*\.\d*)$"
    else:
        pattern = r"^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
    match = re.match(pattern, uptime_string)
    if match is None:
        # Report the unexpected output explicitly rather than failing with
        # an AttributeError on `None.group`.
        msg = "Failed to check load average! Unexpected `uptime` output: `{}`".format(uptime_string)
        await send_msg(client, msg)
        return
    # The captured group is the last of the three values printed by uptime.
    load_av = float(match.group(1))
    if load_av > 10:
        await send_msg(client, "High system load detected: `uptime: {}`".format(uptime_string), force_notify=True)
# check_disk checks the amount of free space on the volume that holds the
# current working directory and issues a warning message if it's under
# FREE_DISK_SPACE_THRESHOLD.
async def check_disk():
    # Sizes are requested in 1 KiB blocks so they can be compared directly
    # against FREE_DISK_SPACE_THRESHOLD, which is expressed in KiB.
    df = os.popen("df --block-size=1024").read().strip()
    # Maps mount point -> available space in KiB.
    volumes = {}
    for line in df.split("\n")[1:]:
        # Field 3 is "available space", the last field is "mounted on".
        # Splitting at most 5 times keeps a mount point that contains
        # spaces in one piece.
        fields = line.split(None, 5)
        if len(fields) < 6 or not fields[3].isdigit():
            # Skip blank or malformed lines instead of crashing on them.
            continue
        volumes[fields[5].rstrip()] = int(fields[3])
    wd = os.getcwd()
    vol = ""
    # Walk the mount points from longest to shortest so that the most
    # specific mount point containing the working directory wins.
    for mp in sorted(volumes, key=len, reverse=True):
        # Match on a path-component boundary: "/home" must not claim a
        # working directory such as "/home2/user".
        if wd == mp or wd.startswith(mp.rstrip("/") + "/"):
            vol = mp
            break
    if vol == "":
        msg = "Failed to check free disk space! Didn't find a suitable mount point to check.\ndf output:\n{}".format(df)
        await send_msg(client, msg)
        return
    if volumes[vol] < FREE_DISK_SPACE_THRESHOLD:
        free_space_gb = "{:.2f}".format(volumes[vol] / GB)
        await send_msg(client, "WARNING! Low disk space: {}GiB".format(free_space_gb), force_notify=True)
# check_health checks /health-check endpoint and reports recent issues # check_health checks /health-check endpoint and reports recent issues
async def check_health(): async def check_health():
print("\nChecking portal health status...") print("\nChecking portal health status...")

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import discord, sys, traceback, io, os, asyncio, re import discord, sys, traceback, io, os, asyncio
from bot_utils import setup, send_msg from bot_utils import setup, send_msg
from datetime import datetime, timedelta from datetime import datetime, timedelta
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
@ -51,27 +51,12 @@ async def on_ready():
async def run_checks(): async def run_checks():
print("Running Skynet portal log checks") print("Running Skynet portal log checks")
try: try:
await check_load_average()
await check_docker_logs() await check_docker_logs()
except: # catch all exceptions except: # catch all exceptions
trace = traceback.format_exc() trace = traceback.format_exc()
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
# check_load_average monitors the system's load average value and issues a
# warning message if it exceeds 10. If the output of `uptime` cannot be
# parsed, a message describing the problem is sent instead.
async def check_load_average():
    uptime_string = os.popen("uptime").read().strip()
    # sys.platform is lowercase ("darwin" on macOS); comparing against
    # "Darwin" never matches, which made the macOS pattern unreachable.
    if sys.platform == "darwin":
        # macOS separates the values with spaces and says "averages".
        pattern = r"^.*load averages: \d*\.\d* \d*\.\d* (\d*\.\d*)$"
    else:
        pattern = r"^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
    match = re.match(pattern, uptime_string)
    if match is None:
        # Report the unexpected output explicitly rather than failing with
        # an AttributeError on `None.group`.
        msg = "Failed to check load average! Unexpected `uptime` output: `{}`".format(uptime_string)
        await send_msg(client, msg)
        return
    # The captured group is the last of the three values printed by uptime.
    load_av = float(match.group(1))
    if load_av > 10:
        await send_msg(client, "High system load detected: `uptime: {}`".format(uptime_string), force_notify=True)
# check_docker_logs checks the docker logs by filtering on the docker image name # check_docker_logs checks the docker logs by filtering on the docker image name
async def check_docker_logs(): async def check_docker_logs():
print("\nChecking docker logs...") print("\nChecking docker logs...")