From 59a77bfaf6f4cd6781242ef18f9aece51fb03502 Mon Sep 17 00:00:00 2001 From: Ivaylo Novakov Date: Fri, 4 Sep 2020 16:12:20 +0200 Subject: [PATCH 1/6] Add a health checker script to Gollum. --- setup-scripts/funds-checker.py | 17 ++-- setup-scripts/health-checker.py | 102 ++++++++++++++++++++ setup-scripts/setup-health-check-scripts.sh | 9 +- 3 files changed, 115 insertions(+), 13 deletions(-) create mode 100755 setup-scripts/health-checker.py diff --git a/setup-scripts/funds-checker.py b/setup-scripts/funds-checker.py index 70d710ed..22a4810d 100755 --- a/setup-scripts/funds-checker.py +++ b/setup-scripts/funds-checker.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ -health-checker runs simple health checks on a portal node using the siad API and +funds-checker runs simple checks on a portal node using the siad API and dispatches messages to a Discord channel. """ @@ -21,23 +21,22 @@ async def exit_after(delay): async def on_ready(): await run_checks() asyncio.create_task(exit_after(3)) - await client.close() async def run_checks(): - print("Running Skynet portal health checks") + print("Running Skynet portal funds checks") try: - await check_health() + await check_funds() except: # catch all exceptions trace = traceback.format_exc() await send_msg(client, "```\n{}\n```".format(trace), force_notify=True) -# check_health checks that the wallet is unlocked, that it has at least 1 -# allowance worth of money left, and if more than hald the allowance is spent. If -# all checks pass it sends a informational message. -async def check_health(): +# check_funds checks that the wallet is unlocked, that it has at least 1 +# allowance worth of money left, and if less than half the allowance is spent. +# If all checks pass it sends an informational message. +async def check_funds(): print("\nChecking wallet/funds health...") wallet_get = siad.get_wallet() renter_get = siad.get_renter() @@ -73,7 +72,7 @@ async def check_health(): return # Send an informational heartbeat if all checks passed. - await send_msg(client, "Health checks passed:\n{} \n{}".format(balance_msg, alloc_msg)) + await send_msg(client, "Funds checks passed:\n{} \n{}".format(balance_msg, alloc_msg)) client.run(bot_token) diff --git a/setup-scripts/health-checker.py b/setup-scripts/health-checker.py new file mode 100755 index 00000000..3047564c --- /dev/null +++ b/setup-scripts/health-checker.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 + +import discord, requests, traceback, asyncio, os, sys, json, io +from datetime import datetime, timedelta +from bot_utils import setup, send_msg, siad, sc_precision + +""" +health-checker reads the /health-check endpoint of the portal and dispatches +messages to a Discord channel. +""" + +# The default check interval in hours. +DEFAULT_CHECK_INTERVAL = 1 + +bot_token = setup() +client = discord.Client() + + +# get_hostname reads the HOSTNAME from the .env file passed as first argument +# to the script +async def get_hostname(): + if len(sys.argv) > 1: + env_file = sys.argv[1] + with open(env_file, 'r') as file: + for line in file.read().split('\n'): + pair = line.split("=") + if pair[0] == "HOSTNAME": + return pair[1] + await send_msg(client, "HOSTNAME not found, cannot check health status", force_notify=True) + os.exit(0) + + +# exit_after kills the script if it hasn't exited on its own after `delay` seconds +async def exit_after(delay): + await asyncio.sleep(delay) + os._exit(0) + + +@client.event +async def on_ready(): + await run_checks() + asyncio.create_task(exit_after(3)) + + +async def run_checks(): + print("Running Skynet portal health checks") + try: + await check_health() + except: # catch all exceptions + trace = traceback.format_exc() + print("[DEBUG] run_checks() failed.") + if len(trace) < 1900: + await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) + else: + await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace), filename="failed_checks.log"), force_notify=True) + + +# check_health checks /health-check endpoint and reports recent issues +async def check_health(): + print("\nChecking wallet/funds health...") + + try: + hostname = await get_hostname() + res = requests.get("http://"+hostname+"/health-check") + except: # catch all exceptions + trace = traceback.format_exc() + print("[DEBUG] check_health() failed.") + if len(trace) < 1900: + await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) + else: + await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace), filename="failed_checks.log"), force_notify=True) + return + + # Get the number of hours to look back in the logs or use 1 as default. + check_hours = DEFAULT_CHECK_INTERVAL + if len(sys.argv) > 3: + check_hours = int(sys.argv[3]) + + # Check the health records. + failed_checks = [] + passed_checks_counter = 0 + time_limit = datetime.now() - timedelta(hours=check_hours) + for rec in res.json(): + time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ') + if time < time_limit: + continue + for check in rec['checks']: + if check['up'] == False: + # We append the entire record, so we can get the full context. + failed_checks.append(rec) + break + passed_checks_counter += 1 + + if len(failed_checks) > 0: + await send_msg(client, "Found {} failed checks over the last {} hours!".format(len(failed_checks), check_hours), file=discord.File(io.BytesIO(json.dumps(failed_checks, indent=2).encode()), filename="failed_checks.log"), force_notify=True) + return + + # Send an informational heartbeat if all checks passed. + await send_msg(client, "Health checks passed: {}\n".format(passed_checks_counter)) + + +client.run(bot_token) diff --git a/setup-scripts/setup-health-check-scripts.sh b/setup-scripts/setup-health-check-scripts.sh index 314a2170..7d17b85b 100755 --- a/setup-scripts/setup-health-check-scripts.sh +++ b/setup-scripts/setup-health-check-scripts.sh @@ -5,11 +5,12 @@ set -e # exit on first error sudo apt-get update sudo apt-get -y install python3-pip -pip3 install discord.py -pip3 install python-dotenv +pip3 install discord.py python-dotenv requests fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env" logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8" +healthCheck="0 * * * * /home/user/skynet-webportal/setup-scripts/health-checker.py /home/user/skynet-webportal/.env sia 1" -(crontab -u user -l; echo "$fundsCheck" ) | crontab -u user - -(crontab -u user -l; echo "$logsCheck" ) | crontab -u user - +(crontab -u user -l; echo "$fundsCheck" ) | crontab -u user - +(crontab -u user -l; echo "$logsCheck" ) | crontab -u user - +(crontab -u user -l; echo "$healthCheck" ) | crontab -u user - From 62e27120cdc96cfbb6e156bc3aa281d25db8b9b7 Mon Sep 17 00:00:00 2001 From: Ivaylo Novakov Date: Fri, 4 Sep 2020 16:39:39 +0200 Subject: [PATCH 2/6] Use localhost. --- setup-scripts/health-checker.py | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/setup-scripts/health-checker.py b/setup-scripts/health-checker.py index 3047564c..039c9410 100755 --- a/setup-scripts/health-checker.py +++ b/setup-scripts/health-checker.py @@ -16,20 +16,6 @@ bot_token = setup() client = discord.Client() -# get_hostname reads the HOSTNAME from the .env file passed as first argument -# to the script -async def get_hostname(): - if len(sys.argv) > 1: - env_file = sys.argv[1] - with open(env_file, 'r') as file: - for line in file.read().split('\n'): - pair = line.split("=") - if pair[0] == "HOSTNAME": - return pair[1] - await send_msg(client, "HOSTNAME not found, cannot check health status", force_notify=True) - os.exit(0) - - # exit_after kills the script if it hasn't exited on its own after `delay` seconds async def exit_after(delay): await asyncio.sleep(delay) @@ -52,7 +38,7 @@ async def run_checks(): if len(trace) < 1900: await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) else: - await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace), filename="failed_checks.log"), force_notify=True) + await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True) # check_health checks /health-check endpoint and reports recent issues @@ -60,15 +46,14 @@ async def check_health(): print("\nChecking wallet/funds health...") try: - hostname = await get_hostname() - res = requests.get("http://"+hostname+"/health-check") + res = requests.get("http://localhost/health-check", verify=False) except: # catch all exceptions trace = traceback.format_exc() print("[DEBUG] check_health() failed.") if len(trace) < 1900: await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) else: - await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace), filename="failed_checks.log"), force_notify=True) + await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True) return # Get the number of hours to look back in the logs or use 1 as default. From a0a9137ae70b3718c904dba7b3513f705af2f7e9 Mon Sep 17 00:00:00 2001 From: Ivaylo Novakov Date: Fri, 4 Sep 2020 16:44:19 +0200 Subject: [PATCH 3/6] Update setup-scripts/health-checker.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Karol Wypchło --- setup-scripts/health-checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup-scripts/health-checker.py b/setup-scripts/health-checker.py index 039c9410..d5b4c75a 100755 --- a/setup-scripts/health-checker.py +++ b/setup-scripts/health-checker.py @@ -43,7 +43,7 @@ async def run_checks(): # check_health checks /health-check endpoint and reports recent issues async def check_health(): - print("\nChecking wallet/funds health...") + print("\nChecking portal health status...") try: res = requests.get("http://localhost/health-check", verify=False) From 1cc20903c635bd881738ec8b023e7fd7d459357b Mon Sep 17 00:00:00 2001 From: Ivaylo Novakov Date: Fri, 4 Sep 2020 17:07:47 +0200 Subject: [PATCH 4/6] Move max discord message len to a constant. Report critical checks failed. Formatting. --- setup-scripts/health-checker.py | 51 ++++++++++++++++++++++++--------- setup-scripts/log-checker.py | 14 +++++---- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/setup-scripts/health-checker.py b/setup-scripts/health-checker.py index d5b4c75a..b8df8b71 100755 --- a/setup-scripts/health-checker.py +++ b/setup-scripts/health-checker.py @@ -1,8 +1,16 @@ #!/usr/bin/env python3 -import discord, requests, traceback, asyncio, os, sys, json, io +import asyncio +import io +import json +import os +import sys +import traceback from datetime import datetime, timedelta -from bot_utils import setup, send_msg, siad, sc_precision + +import discord +import requests +from bot_utils import setup, send_msg """ health-checker reads the /health-check endpoint of the portal and dispatches @@ -11,6 +19,7 @@ messages to a Discord channel. # The default check interval in hours. DEFAULT_CHECK_INTERVAL = 1 +DISCORD_MAX_MESSAGE_LENGTH = 1900 bot_token = setup() client = discord.Client() @@ -32,13 +41,15 @@ async def run_checks(): print("Running Skynet portal health checks") try: await check_health() - except: # catch all exceptions + except: trace = traceback.format_exc() print("[DEBUG] run_checks() failed.") - if len(trace) < 1900: + if len(trace) < DISCORD_MAX_MESSAGE_LENGTH: await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) else: - await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True) + await send_msg(client, "Failed to run the portal health checks!", + file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), + force_notify=True) # check_health checks /health-check endpoint and reports recent issues @@ -47,13 +58,15 @@ async def check_health(): try: res = requests.get("http://localhost/health-check", verify=False) - except: # catch all exceptions + except: trace = traceback.format_exc() print("[DEBUG] check_health() failed.") - if len(trace) < 1900: + if len(trace) < DISCORD_MAX_MESSAGE_LENGTH: await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) else: - await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True) + await send_msg(client, "Failed to run the checks!", + file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), + force_notify=True) return # Get the number of hours to look back in the logs or use 1 as default. @@ -62,22 +75,32 @@ async def check_health(): check_hours = int(sys.argv[3]) # Check the health records. - failed_checks = [] + failed_records = [] + failed_checks = 0 + failed_critical = 0 passed_checks_counter = 0 time_limit = datetime.now() - timedelta(hours=check_hours) for rec in res.json(): time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ') if time < time_limit: continue + bad = False for check in rec['checks']: if check['up'] == False: - # We append the entire record, so we can get the full context. - failed_checks.append(rec) - break + bad = True + failed_checks += 1 + if check['critical']: + failed_critical += 1 + if bad: + # We append the entire record, so we can get the full context. + failed_records.append(rec) passed_checks_counter += 1 - if len(failed_checks) > 0: - await send_msg(client, "Found {} failed checks over the last {} hours!".format(len(failed_checks), check_hours), file=discord.File(io.BytesIO(json.dumps(failed_checks, indent=2).encode()), filename="failed_checks.log"), force_notify=True) + if len(failed_records) > 0: + message = "Found {} failed checks ({} critical) over the last {} hours!".format(failed_checks, failed_critical, + check_hours) + await send_msg(client, message, file=discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()), + filename="failed_checks.log"), force_notify=True) return # Send an informational heartbeat if all checks passed. diff --git a/setup-scripts/log-checker.py b/setup-scripts/log-checker.py index 95092201..50363e88 100755 --- a/setup-scripts/log-checker.py +++ b/setup-scripts/log-checker.py @@ -20,6 +20,9 @@ Arguments: # The default check interval in hours. DEFAULT_CHECK_INTERVAL = 1 +# Discord messages have a limit on their length set at 2000 bytes. We use +# a lower limit in order to leave some space for additional message text. +DISCORD_MAX_MESSAGE_LENGTH = 1900 bot_token = setup() client = discord.Client() @@ -92,11 +95,12 @@ async def check_docker_logs(): std_err = std_err[pos+1:] upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(container_name, time.year, time.month, time.day, time.hour, time.minute, time.second) await send_msg(client, "Error(s) found in log!", file=discord.File(io.BytesIO(std_err.encode()), filename=upload_name), force_notify=True) - # Send at most 1900 characters of logs, rounded down to the nearest new line. - # This is a limitation in the size of Discord messages - they can be at most - # 2000 characters long (and we send some extra characters before the error log). - if len(std_err) > 1900: - pos = std_err.find("\n", -1900) + # Send at most DISCORD_MAX_MESSAGE_LENGTH characters of logs, rounded + # down to the nearest new line. This is a limitation in the size of + # Discord messages - they can be at most 2000 characters long (and we + # send some extra characters before the error log). + if len(std_err) > DISCORD_MAX_MESSAGE_LENGTH: + pos = std_err.find("\n", -DISCORD_MAX_MESSAGE_LENGTH) std_err = std_err[pos+1:] await send_msg(client, "Error(s) preview:\n{}".format(std_err), force_notify=True) return From 5eece67b03b022067ee80e47e0fb6b44f06badc1 Mon Sep 17 00:00:00 2001 From: Ivaylo Novakov Date: Fri, 4 Sep 2020 17:13:36 +0200 Subject: [PATCH 5/6] Move parameter parsing to the top of the script. --- setup-scripts/health-checker.py | 16 ++++++++-------- setup-scripts/log-checker.py | 34 ++++++++++++++++----------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/setup-scripts/health-checker.py b/setup-scripts/health-checker.py index b8df8b71..f28d8469 100755 --- a/setup-scripts/health-checker.py +++ b/setup-scripts/health-checker.py @@ -17,8 +17,13 @@ health-checker reads the /health-check endpoint of the portal and dispatches messages to a Discord channel. """ -# The default check interval in hours. -DEFAULT_CHECK_INTERVAL = 1 +# Get the number of hours to look back in the logs or use 1 as default. +CHECK_HOURS = 1 +if len(sys.argv) > 3: + CHECK_HOURS = int(sys.argv[3]) + +# Discord messages have a limit on their length set at 2000 bytes. We use +# a lower limit in order to leave some space for additional message text. DISCORD_MAX_MESSAGE_LENGTH = 1900 bot_token = setup() @@ -69,17 +74,12 @@ async def check_health(): force_notify=True) return - # Get the number of hours to look back in the logs or use 1 as default. - check_hours = DEFAULT_CHECK_INTERVAL - if len(sys.argv) > 3: - check_hours = int(sys.argv[3]) - # Check the health records. failed_records = [] failed_checks = 0 failed_critical = 0 passed_checks_counter = 0 - time_limit = datetime.now() - timedelta(hours=check_hours) + time_limit = datetime.now() - timedelta(hours=CHECK_HOURS) for rec in res.json(): time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ') if time < time_limit: diff --git a/setup-scripts/log-checker.py b/setup-scripts/log-checker.py index 50363e88..029b9a07 100755 --- a/setup-scripts/log-checker.py +++ b/setup-scripts/log-checker.py @@ -18,8 +18,16 @@ Arguments: """ -# The default check interval in hours. -DEFAULT_CHECK_INTERVAL = 1 +# Get the container name as an argument or use "sia" as default. +CONTAINER_NAME = "sia" +if len(sys.argv) > 2: + CONTAINER_NAME = sys.argv[2] + +# Get the number of hours to look back in the logs or use 1 as default. +CHECK_HOURS = 1 +if len(sys.argv) > 3: + CHECK_HOURS = int(sys.argv[3]) + # Discord messages have a limit on their length set at 2000 bytes. We use # a lower limit in order to leave some space for additional message text. DISCORD_MAX_MESSAGE_LENGTH = 1900 @@ -68,23 +76,13 @@ async def check_load_average(): async def check_docker_logs(): print("\nChecking docker logs...") - # Get the container name as an argument or use "sia" as default. - container_name = "sia" - if len(sys.argv) > 2: - container_name = sys.argv[2] - - # Get the number of hours to look back in the logs or use 1 as default. - check_hours = DEFAULT_CHECK_INTERVAL - if len(sys.argv) > 3: - check_hours = int(sys.argv[3]) - now = datetime.now() - time = now - timedelta(hours=check_hours) - time_string = "{}h".format(check_hours) + time = now - timedelta(hours=CHECK_HOURS) + time_string = "{}h".format(CHECK_HOURS) # Read the logs. - print("[DEBUG] Will run `docker logs --since {} {}`".format(time_string, container_name)) - proc = Popen(["docker", "logs", "--since", time_string, container_name], stdin=PIPE, stdout=PIPE, stderr=PIPE, text=True) + print("[DEBUG] Will run `docker logs --since {} {}`".format(time_string, CONTAINER_NAME)) + proc = Popen(["docker", "logs", "--since", time_string, CONTAINER_NAME], stdin=PIPE, stdout=PIPE, stderr=PIPE, text=True) std_out, std_err = proc.communicate() if len(std_err) > 0: @@ -93,7 +91,7 @@ async def check_docker_logs(): if len(std_err) > one_mb: pos = std_err.find("\n", -one_mb) std_err = std_err[pos+1:] - upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(container_name, time.year, time.month, time.day, time.hour, time.minute, time.second) + upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(CONTAINER_NAME, time.year, time.month, time.day, time.hour, time.minute, time.second) await send_msg(client, "Error(s) found in log!", file=discord.File(io.BytesIO(std_err.encode()), filename=upload_name), force_notify=True) # Send at most DISCORD_MAX_MESSAGE_LENGTH characters of logs, rounded # down to the nearest new line. This is a limitation in the size of @@ -107,7 +105,7 @@ async def check_docker_logs(): # If there are any critical or severe errors. upload the whole log file. if 'Critical' in std_out or 'Severe' in std_out or 'panic' in std_out: - upload_name = "{}-{}-{}-{}-{}:{}:{}.log".format(container_name, time.year, time.month, time.day, time.hour, time.minute, time.second) + upload_name = "{}-{}-{}-{}-{}:{}:{}.log".format(CONTAINER_NAME, time.year, time.month, time.day, time.hour, time.minute, time.second) await send_msg(client, "Critical or Severe error found in log!", file=discord.File(io.BytesIO(std_out.encode()), filename=upload_name), force_notify=True) return From 3f4742a43668bb26ba6afec647f524b801da4559 Mon Sep 17 00:00:00 2001 From: Ivaylo Novakov Date: Fri, 4 Sep 2020 17:17:26 +0200 Subject: [PATCH 6/6] Only notify the team if critical checks have failed. --- setup-scripts/health-checker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup-scripts/health-checker.py b/setup-scripts/health-checker.py index f28d8469..6322f894 100755 --- a/setup-scripts/health-checker.py +++ b/setup-scripts/health-checker.py @@ -98,9 +98,10 @@ async def check_health(): if len(failed_records) > 0: message = "Found {} failed checks ({} critical) over the last {} hours!".format(failed_checks, failed_critical, - check_hours) - await send_msg(client, message, file=discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()), - filename="failed_checks.log"), force_notify=True) + CHECK_HOURS) + file = discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()), filename="failed_checks.log") + notifyTeam = failed_critical > 0 + await send_msg(client, message, file=file, force_notify=notifyTeam) return # Send an informational heartbeat if all checks passed.