diff --git a/setup-scripts/funds-checker.py b/setup-scripts/funds-checker.py
index 70d710ed..22a4810d 100755
--- a/setup-scripts/funds-checker.py
+++ b/setup-scripts/funds-checker.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 """
-health-checker runs simple health checks on a portal node using the siad API and
+funds-checker runs simple checks on a portal node using the siad API and
 dispatches messages to a Discord channel.
 """
 
@@ -21,23 +21,22 @@ async def exit_after(delay):
 async def on_ready():
     await run_checks()
     asyncio.create_task(exit_after(3))
-    await client.close()
 
 
 async def run_checks():
-    print("Running Skynet portal health checks")
+    print("Running Skynet portal funds checks")
     try:
-        await check_health()
+        await check_funds()
 
     except: # catch all exceptions
         trace = traceback.format_exc()
         await send_msg(client, "```\n{}\n```".format(trace), force_notify=True)
 
 
-# check_health checks that the wallet is unlocked, that it has at least 1
-# allowance worth of money left, and if more than hald the allowance is spent. If
-# all checks pass it sends a informational message.
-async def check_health():
+# check_funds checks that the wallet is unlocked, that it has at least 1
+# allowance worth of money left, and if less than half the allowance is spent.
+# If all checks pass it sends an informational message.
+async def check_funds():
     print("\nChecking wallet/funds health...")
     wallet_get = siad.get_wallet()
     renter_get = siad.get_renter()
@@ -73,7 +72,7 @@ async def check_health():
         return
 
     # Send an informational heartbeat if all checks passed.
-    await send_msg(client, "Health checks passed:\n{} \n{}".format(balance_msg, alloc_msg))
+    await send_msg(client, "Funds checks passed:\n{} \n{}".format(balance_msg, alloc_msg))
 
 
 client.run(bot_token)
diff --git a/setup-scripts/health-checker.py b/setup-scripts/health-checker.py
new file mode 100755
index 00000000..6322f894
--- /dev/null
+++ b/setup-scripts/health-checker.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+
+"""
+health-checker reads the /health-check endpoint of the portal and dispatches
+messages to a Discord channel.
+"""
+
+import asyncio
+import io
+import json
+import os
+import sys
+import traceback
+from datetime import datetime, timedelta, timezone
+
+import discord
+import requests
+from bot_utils import setup, send_msg
+
+# Get the number of hours to look back in the health records or use 1 as
+# default.
+CHECK_HOURS = 1
+if len(sys.argv) > 3:
+    CHECK_HOURS = int(sys.argv[3])
+
+# Discord messages have a limit on their length set at 2000 characters. We use
+# a lower limit in order to leave some space for additional message text.
+DISCORD_MAX_MESSAGE_LENGTH = 1900
+
+# Number of seconds to wait for the /health-check endpoint before giving up, so
+# that a hung portal cannot block the script forever.
+REQUEST_TIMEOUT = 30
+
+bot_token = setup()
+client = discord.Client()
+
+
+# exit_after kills the script if it hasn't exited on its own after `delay` seconds
+async def exit_after(delay):
+    await asyncio.sleep(delay)
+    os._exit(0)
+
+
+@client.event
+async def on_ready():
+    await run_checks()
+    asyncio.create_task(exit_after(3))
+
+
+# send_trace reports a failure to run the checks. Short traces are sent inline,
+# longer ones are attached as a file. The team is always notified, regardless
+# of the length of the trace.
+async def send_trace(trace, summary):
+    if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
+        await send_msg(client, "```\n{}\n```".format(trace), force_notify=True)
+    else:
+        await send_msg(client, summary,
+                       file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
+                       force_notify=True)
+
+
+# run_checks runs the health checks and reports any unexpected failure
+async def run_checks():
+    print("Running Skynet portal health checks")
+    try:
+        await check_health()
+    except Exception:
+        print("[DEBUG] run_checks() failed.")
+        await send_trace(traceback.format_exc(), "Failed to run the portal health checks!")
+
+
+# check_health checks /health-check endpoint and reports recent issues
+async def check_health():
+    print("\nChecking portal health status...")
+
+    try:
+        res = requests.get("http://localhost/health-check", verify=False, timeout=REQUEST_TIMEOUT)
+    except Exception:
+        print("[DEBUG] check_health() failed.")
+        await send_trace(traceback.format_exc(), "Failed to run the checks!")
+        return
+
+    # Check the health records. The record dates are in UTC (note the `Z`
+    # suffix), so the cut-off time has to be in UTC as well.
+    failed_records = []
+    failed_checks = 0
+    failed_critical = 0
+    passed_checks_counter = 0
+    time_limit = datetime.now(timezone.utc) - timedelta(hours=CHECK_HOURS)
+    for rec in res.json():
+        time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
+        if time < time_limit:
+            continue
+        bad = False
+        for check in rec['checks']:
+            if check['up'] == False:
+                bad = True
+                failed_checks += 1
+                if check['critical']:
+                    failed_critical += 1
+        if bad:
+            # We append the entire record, so we can get the full context.
+            failed_records.append(rec)
+        passed_checks_counter += 1
+
+    if len(failed_records) > 0:
+        message = "Found {} failed checks ({} critical) over the last {} hours!".format(failed_checks, failed_critical,
+                                                                                        CHECK_HOURS)
+        file = discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()), filename="failed_checks.log")
+        notifyTeam = failed_critical > 0
+        await send_msg(client, message, file=file, force_notify=notifyTeam)
+        return
+
+    # Send an informational heartbeat if all checks passed.
+    await send_msg(client, "Health checks passed: {}\n".format(passed_checks_counter))
+
+
+client.run(bot_token)
diff --git a/setup-scripts/log-checker.py b/setup-scripts/log-checker.py
index 95092201..029b9a07 100755
--- a/setup-scripts/log-checker.py
+++ b/setup-scripts/log-checker.py
@@ -18,8 +18,19 @@ Arguments:
 
 """
 
-# The default check interval in hours.
-DEFAULT_CHECK_INTERVAL = 1
+# Get the container name as an argument or use "sia" as default.
+CONTAINER_NAME = "sia"
+if len(sys.argv) > 2:
+    CONTAINER_NAME = sys.argv[2]
+
+# Get the number of hours to look back in the logs or use 1 as default.
+CHECK_HOURS = 1
+if len(sys.argv) > 3:
+    CHECK_HOURS = int(sys.argv[3])
+
+# Discord messages have a limit on their length set at 2000 characters. We use
+# a lower limit in order to leave some space for additional message text.
+DISCORD_MAX_MESSAGE_LENGTH = 1900
 
 bot_token = setup()
 client = discord.Client()
@@ -65,23 +76,13 @@ async def check_load_average():
 async def check_docker_logs():
     print("\nChecking docker logs...")
 
-    # Get the container name as an argument or use "sia" as default.
-    container_name = "sia"
-    if len(sys.argv) > 2:
-        container_name = sys.argv[2]
-
-    # Get the number of hours to look back in the logs or use 1 as default.
-    check_hours = DEFAULT_CHECK_INTERVAL
-    if len(sys.argv) > 3:
-        check_hours = int(sys.argv[3])
-
     now = datetime.now()
-    time = now - timedelta(hours=check_hours)
-    time_string = "{}h".format(check_hours)
+    time = now - timedelta(hours=CHECK_HOURS)
+    time_string = "{}h".format(CHECK_HOURS)
 
     # Read the logs.
-    print("[DEBUG] Will run `docker logs --since {} {}`".format(time_string, container_name))
-    proc = Popen(["docker", "logs", "--since", time_string, container_name], stdin=PIPE, stdout=PIPE, stderr=PIPE, text=True)
+    print("[DEBUG] Will run `docker logs --since {} {}`".format(time_string, CONTAINER_NAME))
+    proc = Popen(["docker", "logs", "--since", time_string, CONTAINER_NAME], stdin=PIPE, stdout=PIPE, stderr=PIPE, text=True)
     std_out, std_err = proc.communicate()
 
     if len(std_err) > 0:
@@ -90,20 +91,21 @@ async def check_docker_logs():
         if len(std_err) > one_mb:
             pos = std_err.find("\n", -one_mb)
             std_err = std_err[pos+1:]
-        upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(container_name, time.year, time.month, time.day, time.hour, time.minute, time.second)
+        upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(CONTAINER_NAME, time.year, time.month, time.day, time.hour, time.minute, time.second)
         await send_msg(client, "Error(s) found in log!", file=discord.File(io.BytesIO(std_err.encode()), filename=upload_name), force_notify=True)
-        # Send at most 1900 characters of logs, rounded down to the nearest new line.
-        # This is a limitation in the size of Discord messages - they can be at most
-        # 2000 characters long (and we send some extra characters before the error log).
-        if len(std_err) > 1900:
-            pos = std_err.find("\n", -1900)
+        # Send at most DISCORD_MAX_MESSAGE_LENGTH characters of logs, rounded
+        # down to the nearest new line. This is a limitation in the size of
+        # Discord messages - they can be at most 2000 characters long (and we
+        # send some extra characters before the error log).
+        if len(std_err) > DISCORD_MAX_MESSAGE_LENGTH:
+            pos = std_err.find("\n", -DISCORD_MAX_MESSAGE_LENGTH)
             std_err = std_err[pos+1:]
         await send_msg(client, "Error(s) preview:\n{}".format(std_err), force_notify=True)
         return
 
     # If there are any critical or severe errors. upload the whole log file.
     if 'Critical' in std_out or 'Severe' in std_out or 'panic' in std_out:
-        upload_name = "{}-{}-{}-{}-{}:{}:{}.log".format(container_name, time.year, time.month, time.day, time.hour, time.minute, time.second)
+        upload_name = "{}-{}-{}-{}-{}:{}:{}.log".format(CONTAINER_NAME, time.year, time.month, time.day, time.hour, time.minute, time.second)
         await send_msg(client, "Critical or Severe error found in log!", file=discord.File(io.BytesIO(std_out.encode()), filename=upload_name), force_notify=True)
         return
 
diff --git a/setup-scripts/setup-health-check-scripts.sh b/setup-scripts/setup-health-check-scripts.sh
index 314a2170..7d17b85b 100755
--- a/setup-scripts/setup-health-check-scripts.sh
+++ b/setup-scripts/setup-health-check-scripts.sh
@@ -5,11 +5,12 @@ set -e # exit on first error
 sudo apt-get update
 sudo apt-get -y install python3-pip
 
-pip3 install discord.py
-pip3 install python-dotenv
+pip3 install discord.py python-dotenv requests
 
 fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env"
 logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8"
+healthCheck="0 * * * * /home/user/skynet-webportal/setup-scripts/health-checker.py /home/user/skynet-webportal/.env sia 1"
 
-(crontab -u user -l; echo "$fundsCheck" ) | crontab -u user -
-(crontab -u user -l; echo "$logsCheck"  ) | crontab -u user -
+(crontab -u user -l; echo "$fundsCheck"  ) | crontab -u user -
+(crontab -u user -l; echo "$logsCheck"   ) | crontab -u user -
+(crontab -u user -l; echo "$healthCheck" ) | crontab -u user -