Merge pull request #385 from NebulousLabs/ivo/gollum_time_zones

Make the time comparisons in the health checker timezone-aware.
This commit is contained in:
Ivaylo Novakov 2020-09-09 14:46:27 +02:00 committed by GitHub
commit a4dd527a30
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 22 additions and 16 deletions

View File

@ -10,8 +10,10 @@ import traceback
from datetime import datetime, timedelta from datetime import datetime, timedelta
import discord import discord
import pytz.reference
import requests import requests
from bot_utils import setup, send_msg from bot_utils import setup, send_msg
from tzlocal import get_localzone
""" """
health-checker reads the /health-check endpoint of the portal and dispatches health-checker reads the /health-check endpoint of the portal and dispatches
@ -27,11 +29,10 @@ if len(sys.argv) > 3:
# a lower limit in order to leave some space for additional message text. # a lower limit in order to leave some space for additional message text.
DISCORD_MAX_MESSAGE_LENGTH = 1900 DISCORD_MAX_MESSAGE_LENGTH = 1900
GB = 1 << 20 # converts from KiB to GiB GB = 1 << 30 # 1 GiB in bytes
# We are going to issue Discord warnings if the free space on a server falls # We are going to issue Discord warnings if the free space on a server falls
# under this threshold. # under this threshold.
FREE_DISK_SPACE_THRESHOLD = 50 * GB # 50 GiB FREE_DISK_SPACE_THRESHOLD = 50 * GB
bot_token = setup() bot_token = setup()
client = discord.Client() client = discord.Client()
@ -87,8 +88,8 @@ async def check_disk():
volumes = {} volumes = {}
for line in df.split("\n")[1:]: for line in df.split("\n")[1:]:
fields = list(filter(None, line.split(" "))) fields = list(filter(None, line.split(" ")))
# -1 is "mounted on", 3 is "available space" # -1 is "mounted on", 3 is "available space" in KiB which we want in bytes
volumes[fields[-1]] = fields[3] volumes[fields[-1]] = fields[3] * 1024
# List of mount points, longest to shortest. We'll use that to find the best # List of mount points, longest to shortest. We'll use that to find the best
# fit for the volume we want to check. # fit for the volume we want to check.
mount_points = sorted(volumes.keys(), key=len, reverse=True) mount_points = sorted(volumes.keys(), key=len, reverse=True)
@ -103,7 +104,7 @@ async def check_disk():
await send_msg(client, msg) await send_msg(client, msg)
return return
if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD: if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD:
free_space_gb = "{:.2f}".format(int(volumes[vol])/ GB) free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
await send_msg(client, "WARNING! Low disk space: {}GiB".format(free_space_gb), force_notify=True) await send_msg(client, "WARNING! Low disk space: {}GiB".format(free_space_gb), force_notify=True)
return return
@ -126,13 +127,15 @@ async def check_health():
return return
# Check the health records. # Check the health records.
failed_records = [] passed_checks = 0
failed_checks = 0 failed_checks = 0
failed_critical = 0 failed_critical = 0
passed_checks_counter = 0 failed_records = []
time_limit = datetime.now() - timedelta(hours=CHECK_HOURS) time_limit_unaware = datetime.now() - timedelta(hours=CHECK_HOURS) # local time
time_limit = time_limit_unaware.astimezone(get_localzone()) # time with time zone
for rec in res.json(): for rec in res.json():
time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ') time_unaware = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ') # time in UTC
time = pytz.utc.localize(time_unaware) # time with time zone
if time < time_limit: if time < time_limit:
continue continue
bad = False bad = False
@ -145,18 +148,21 @@ async def check_health():
if bad: if bad:
# We append the entire record, so we can get the full context. # We append the entire record, so we can get the full context.
failed_records.append(rec) failed_records.append(rec)
passed_checks_counter += 1 passed_checks += 1
checks = passed_checks + failed_checks
if len(failed_records) > 0: if len(failed_records) > 0:
message = "Found {} failed checks ({} critical) over the last {} hours!".format(failed_checks, failed_critical, message = "Found {}/{} failed checks ({} critical) over the last {} hours!".format(failed_checks, checks,
CHECK_HOURS) failed_critical, CHECK_HOURS)
file = discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()), filename="failed_checks.log") file = discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()), filename="failed_checks.log")
notifyTeam = failed_critical > 0 notifyTeam = failed_critical > 0
await send_msg(client, message, file=file, force_notify=notifyTeam) await send_msg(client, message, file=file, force_notify=notifyTeam)
return return
# Send an informational heartbeat if all checks passed. # Send an informational heartbeat if all checks passed but only if it's in
await send_msg(client, "Health checks passed: {}\n".format(passed_checks_counter)) # the first CHECK_HOURS hours of the day, essentially the first call.
if datetime.now().hour < CHECK_HOURS:
await send_msg(client, "Health checks passed: {}/{}\n".format(passed_checks, checks))
client.run(bot_token) client.run(bot_token)

View File

@ -5,7 +5,7 @@ set -e # exit on first error
sudo apt-get update sudo apt-get update
sudo apt-get -y install python3-pip sudo apt-get -y install python3-pip
pip3 install discord.py python-dotenv requests pip3 install discord.py python-dotenv requests pytz tzlocal
fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env" fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env"
logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8" logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8"