From 59a77bfaf6f4cd6781242ef18f9aece51fb03502 Mon Sep 17 00:00:00 2001
From: Ivaylo Novakov
Date: Fri, 4 Sep 2020 16:12:20 +0200
Subject: [PATCH] Add a health checker script to Gollum.

---
 setup-scripts/funds-checker.py              |  17 ++--
 setup-scripts/health-checker.py             | 102 ++++++++++++++++++++
 setup-scripts/setup-health-check-scripts.sh |   9 +-
 3 files changed, 115 insertions(+), 13 deletions(-)
 create mode 100755 setup-scripts/health-checker.py

diff --git a/setup-scripts/funds-checker.py b/setup-scripts/funds-checker.py
index 70d710ed..22a4810d 100755
--- a/setup-scripts/funds-checker.py
+++ b/setup-scripts/funds-checker.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 """
-health-checker runs simple health checks on a portal node using the siad API and
+funds-checker runs simple checks on a portal node using the siad API and
 dispatches messages to a Discord channel.
 """
 
@@ -21,23 +21,22 @@ async def exit_after(delay):
 async def on_ready():
     await run_checks()
     asyncio.create_task(exit_after(3))
-    await client.close()
 
 
 async def run_checks():
-    print("Running Skynet portal health checks")
+    print("Running Skynet portal funds checks")
     try:
-        await check_health()
+        await check_funds()
 
     except: # catch all exceptions
         trace = traceback.format_exc()
         await send_msg(client, "```\n{}\n```".format(trace), force_notify=True)
 
 
-# check_health checks that the wallet is unlocked, that it has at least 1
-# allowance worth of money left, and if more than hald the allowance is spent. If
-# all checks pass it sends a informational message.
-async def check_health():
+# check_funds checks that the wallet is unlocked, that it has at least 1
+# allowance worth of money left, and if less than half the allowance is spent.
+# If all checks pass it sends an informational message.
+async def check_funds():
     print("\nChecking wallet/funds health...")
     wallet_get = siad.get_wallet()
     renter_get = siad.get_renter()
@@ -73,7 +72,7 @@ async def check_health():
         return
 
     # Send an informational heartbeat if all checks passed.
-    await send_msg(client, "Health checks passed:\n{} \n{}".format(balance_msg, alloc_msg))
+    await send_msg(client, "Funds checks passed:\n{} \n{}".format(balance_msg, alloc_msg))
 
 
 client.run(bot_token)
diff --git a/setup-scripts/health-checker.py b/setup-scripts/health-checker.py
new file mode 100755
index 00000000..3047564c
--- /dev/null
+++ b/setup-scripts/health-checker.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+
+import discord, requests, traceback, asyncio, os, sys, json, io
+from datetime import datetime, timedelta, timezone
+from bot_utils import setup, send_msg, siad, sc_precision
+
+"""
+health-checker reads the /health-check endpoint of the portal and dispatches
+messages to a Discord channel.
+"""
+
+# The default check interval in hours.
+DEFAULT_CHECK_INTERVAL = 1
+
+bot_token = setup()
+client = discord.Client()
+
+
+# get_hostname returns the HOSTNAME value from the .env file passed as the first
+# script argument. If it is missing, Discord is notified and the script exits.
+async def get_hostname():
+    if len(sys.argv) > 1:
+        env_file = sys.argv[1]
+        with open(env_file, 'r') as file:
+            for line in file.read().split('\n'):
+                key, _, value = line.partition("=")
+                if key.strip() == "HOSTNAME":
+                    return value.strip()
+    await send_msg(client, "HOSTNAME not found, cannot check health status", force_notify=True)
+    os._exit(0)
+
+
+# exit_after kills the script if it hasn't exited on its own after `delay` seconds
+async def exit_after(delay):
+    await asyncio.sleep(delay)
+    os._exit(0)
+
+
+@client.event
+async def on_ready():
+    await run_checks()
+    asyncio.create_task(exit_after(3))
+
+
+async def run_checks():
+    print("Running Skynet portal health checks")
+    try:
+        await check_health()
+    except: # catch all exceptions
+        trace = traceback.format_exc()
+        print("[DEBUG] run_checks() failed.")
+        if len(trace) < 1900:
+            await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
+        else:
+            await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True)
+
+
+# check_health queries the /health-check endpoint and reports recent failures.
+async def check_health():
+    print("\nChecking portal health status...")
+
+    try:
+        hostname = await get_hostname()
+        res = requests.get("http://"+hostname+"/health-check", timeout=30)
+    except: # catch all exceptions
+        trace = traceback.format_exc()
+        print("[DEBUG] check_health() failed.")
+        if len(trace) < 1900:
+            await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
+        else:
+            await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True)
+        return
+
+    # Get the number of hours to look back in the logs or use 1 as default.
+    check_hours = DEFAULT_CHECK_INTERVAL
+    if len(sys.argv) > 3:
+        check_hours = int(sys.argv[3])
+
+    # Check the health records. Record dates are UTC ("...Z"), so compare in UTC.
+    failed_checks = []
+    passed_checks_counter = 0
+    time_limit = datetime.now(timezone.utc) - timedelta(hours=check_hours)
+    for rec in res.json():
+        time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
+        if time < time_limit:
+            continue
+        for check in rec['checks']:
+            if check['up'] == False:
+                # We append the entire record, so we can get the full context.
+                failed_checks.append(rec)
+                break
+        passed_checks_counter += 1
+
+    if len(failed_checks) > 0:
+        await send_msg(client, "Found {} failed checks over the last {} hours!".format(len(failed_checks), check_hours), file=discord.File(io.BytesIO(json.dumps(failed_checks, indent=2).encode()), filename="failed_checks.log"), force_notify=True)
+        return
+
+    # Send an informational heartbeat if all checks passed.
+    await send_msg(client, "Health checks passed: {}\n".format(passed_checks_counter))
+
+
+client.run(bot_token)
diff --git a/setup-scripts/setup-health-check-scripts.sh b/setup-scripts/setup-health-check-scripts.sh
index 314a2170..7d17b85b 100755
--- a/setup-scripts/setup-health-check-scripts.sh
+++ b/setup-scripts/setup-health-check-scripts.sh
@@ -5,11 +5,12 @@ set -e # exit on first error
 sudo apt-get update
 sudo apt-get -y install python3-pip
 
-pip3 install discord.py
-pip3 install python-dotenv
+pip3 install discord.py python-dotenv requests
 
 fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env"
 logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8"
+healthCheck="0 * * * * /home/user/skynet-webportal/setup-scripts/health-checker.py /home/user/skynet-webportal/.env sia 1"
 
-(crontab -u user -l; echo "$fundsCheck" ) | crontab -u user -
-(crontab -u user -l; echo "$logsCheck"  ) | crontab -u user -
+(crontab -u user -l; echo "$fundsCheck"  ) | crontab -u user -
+(crontab -u user -l; echo "$logsCheck"   ) | crontab -u user -
+(crontab -u user -l; echo "$healthCheck" ) | crontab -u user -