Add free disk space check to health-checker.py.
Move load-average check to health-checker.py.
This commit is contained in:
parent
ff56990faa
commit
0838e4f5e5
|
@ -4,6 +4,7 @@ import asyncio
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
@ -26,6 +27,12 @@ if len(sys.argv) > 3:
|
||||||
# a lower limit in order to leave some space for additional message text.
|
# a lower limit in order to leave some space for additional message text.
|
||||||
DISCORD_MAX_MESSAGE_LENGTH = 1900
|
DISCORD_MAX_MESSAGE_LENGTH = 1900
|
||||||
|
|
||||||
|
GB = 1 << 20 # converts from KiB to GiB
|
||||||
|
# We are going to issue Discord warnings if the free space on a server falls
|
||||||
|
# under this threshold.
|
||||||
|
FREE_DISK_SPACE_THRESHOLD = 50 * GB # 50 GiB
|
||||||
|
|
||||||
|
|
||||||
bot_token = setup()
|
bot_token = setup()
|
||||||
client = discord.Client()
|
client = discord.Client()
|
||||||
|
|
||||||
|
@ -45,6 +52,8 @@ async def on_ready():
|
||||||
async def run_checks():
|
async def run_checks():
|
||||||
print("Running Skynet portal health checks")
|
print("Running Skynet portal health checks")
|
||||||
try:
|
try:
|
||||||
|
await check_load_average()
|
||||||
|
await check_disk()
|
||||||
await check_health()
|
await check_health()
|
||||||
except:
|
except:
|
||||||
trace = traceback.format_exc()
|
trace = traceback.format_exc()
|
||||||
|
@ -57,6 +66,47 @@ async def run_checks():
|
||||||
force_notify=True)
|
force_notify=True)
|
||||||
|
|
||||||
|
|
||||||
|
# check_load_average monitors the system's load average value and issues a
# warning message if it exceeds 10.
async def check_load_average():
    """Warn on Discord when the system load average is above 10.

    Runs `uptime`, extracts the last of the three load figures it prints
    (the 15-minute average) and sends a force-notified message containing
    the full `uptime` line when that figure is greater than 10.

    Raises:
        ValueError: if the `uptime` output contains no recognizable
            load-average section.
    """
    uptime_string = os.popen("uptime").read().strip()
    load_av = _parse_load_average(uptime_string)
    if load_av > 10:
        await send_msg(client, "High system load detected: `uptime: {}`".format(uptime_string), force_notify=True)


# Matches both the Linux layout ("load average: 0.10, 0.20, 0.30") and the
# macOS layout ("load averages: 0.10 0.20 0.30"). Handling both in a single
# pattern avoids any platform check; the previous check compared
# sys.platform against "Darwin", which never matches because the real
# value is lowercase "darwin".
_LOAD_AVERAGE_PATTERN = re.compile(
    r"load averages?: \d+\.\d+,? \d+\.\d+,? (\d+\.\d+)$"
)


def _parse_load_average(uptime_string):
    """Return the last load-average figure of an `uptime` line as a float.

    Args:
        uptime_string: the stripped, single-line output of `uptime`.

    Raises:
        ValueError: if no load-average section is found at the end of the
            line (for example when `uptime` is missing and the output is
            empty).
    """
    found = _LOAD_AVERAGE_PATTERN.search(uptime_string)
    if found is None:
        # Fail with a descriptive error instead of the AttributeError that
        # calling .group() on None would produce.
        raise ValueError(
            "Could not parse load average from uptime output: {!r}".format(uptime_string)
        )
    return float(found.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
# check_disk checks the amount of free space on the volume that holds the
# current working directory and issues a warning message if it's under
# FREE_DISK_SPACE_THRESHOLD.
async def check_disk():
    """Warn on Discord when the working directory's volume is low on space.

    Parses `df` output to find the available space per mount point, picks
    the mount point that contains the current working directory and sends
    a force-notified warning when its available space (in KiB) is below
    FREE_DISK_SPACE_THRESHOLD. If no mount point matches, a message with
    the raw `df` output is sent instead.
    """
    # -P selects the POSIX output layout (one line per file system) and -k
    # reports sizes in 1024-byte blocks. Unlike --block-size, both flags
    # are also understood by non-GNU df implementations.
    df = os.popen("df -Pk").read().strip()
    volumes = _parse_df_available(df)
    # os.getcwd() replaces spawning `pwd`; it yields the working directory
    # without an extra subprocess.
    vol = _find_mount_point(os.getcwd(), volumes)
    if vol == "":
        msg = "Failed to check free disk space! Didn't find a suitable mount point to check.\ndf output:\n{}".format(df)
        await send_msg(client, msg)
        return
    available = volumes[vol]
    if available < FREE_DISK_SPACE_THRESHOLD:
        free_space_gb = "{:.2f}".format(available / GB)
        await send_msg(client, "WARNING! Low disk space: {}GiB".format(free_space_gb), force_notify=True)


def _parse_df_available(df_output):
    """Map each mount point in `df -Pk` output to its available KiB (int).

    Lines that do not have the six expected columns, or whose "available"
    column is not a plain number (the header, blank lines), are skipped
    rather than raising.
    """
    volumes = {}
    for line in df_output.splitlines():
        # Columns: filesystem, blocks, used, available, capacity, mount.
        # Limiting the split keeps a mount point with spaces in one piece.
        columns = line.split(None, 5)
        if len(columns) != 6 or not columns[3].isdigit():
            continue
        volumes[columns[5].strip()] = int(columns[3])
    return volumes


def _find_mount_point(path, mount_points):
    """Return the longest mount point that contains `path`, or "" if none.

    A mount point only matches on a path-component boundary, so "/home"
    contains "/home/user" but not "/home2/user".
    """
    # Longest first, so the most specific mount point wins.
    for mp in sorted(mount_points, key=len, reverse=True):
        if path == mp or path.startswith(mp.rstrip("/") + "/"):
            return mp
    return ""
|
||||||
|
|
||||||
|
|
||||||
# check_health checks /health-check endpoint and reports recent issues
|
# check_health checks /health-check endpoint and reports recent issues
|
||||||
async def check_health():
|
async def check_health():
|
||||||
print("\nChecking portal health status...")
|
print("\nChecking portal health status...")
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import discord, sys, traceback, io, os, asyncio, re
|
import discord, sys, traceback, io, os, asyncio
|
||||||
from bot_utils import setup, send_msg
|
from bot_utils import setup, send_msg
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from subprocess import Popen, PIPE
|
from subprocess import Popen, PIPE
|
||||||
|
@ -51,27 +51,12 @@ async def on_ready():
|
||||||
async def run_checks():
|
async def run_checks():
|
||||||
print("Running Skynet portal log checks")
|
print("Running Skynet portal log checks")
|
||||||
try:
|
try:
|
||||||
await check_load_average()
|
|
||||||
await check_docker_logs()
|
await check_docker_logs()
|
||||||
|
|
||||||
except: # catch all exceptions
|
except: # catch all exceptions
|
||||||
trace = traceback.format_exc()
|
trace = traceback.format_exc()
|
||||||
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
|
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
|
||||||
|
|
||||||
|
|
||||||
# check_load_average monitors the system's load average value and issues a
# warning message if it exceeds 10.
async def check_load_average():
    """Warn on Discord when the system load average is above 10.

    Runs `uptime`, extracts the last of the three load figures it prints
    (the 15-minute average) and sends a force-notified message containing
    the full `uptime` line when that figure is greater than 10.

    Raises:
        ValueError: if the `uptime` output contains no recognizable
            load-average section.
    """
    uptime_string = os.popen("uptime").read().strip()
    load_av = _parse_load_average(uptime_string)
    if load_av > 10:
        await send_msg(client, "High system load detected: `uptime: {}`".format(uptime_string), force_notify=True)


def _parse_load_average(uptime_string):
    """Return the last load-average figure of an `uptime` line as a float.

    Accepts both the Linux layout ("load average: 0.10, 0.20, 0.30") and
    the macOS layout ("load averages: 0.10 0.20 0.30"), so no platform
    check is needed. The previous check compared sys.platform against
    "Darwin", which never matches because the real value is lowercase
    "darwin".

    Raises:
        ValueError: if no load-average section is found at the end of the
            line.
    """
    # Raw string: "\d" and "\." are regex escapes, not string escapes.
    pattern = r"load averages?: \d+\.\d+,? \d+\.\d+,? (\d+\.\d+)$"
    found = re.search(pattern, uptime_string)
    if found is None:
        # Fail with a descriptive error instead of the AttributeError that
        # calling .group() on None would produce.
        raise ValueError(
            "Could not parse load average from uptime output: {!r}".format(uptime_string)
        )
    return float(found.group(1))
|
|
||||||
|
|
||||||
# check_docker_logs checks the docker logs by filtering on the docker image name
|
# check_docker_logs checks the docker logs by filtering on the docker image name
|
||||||
async def check_docker_logs():
|
async def check_docker_logs():
|
||||||
print("\nChecking docker logs...")
|
print("\nChecking docker logs...")
|
||||||
|
|
Reference in New Issue