From 1cc20903c635bd881738ec8b023e7fd7d459357b Mon Sep 17 00:00:00 2001 From: Ivaylo Novakov Date: Fri, 4 Sep 2020 17:07:47 +0200 Subject: [PATCH] Move max discord message len to a constant. Report critical checks failed. Formatting. --- setup-scripts/health-checker.py | 51 ++++++++++++++++++++++++--------- setup-scripts/log-checker.py | 14 +++++---- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/setup-scripts/health-checker.py b/setup-scripts/health-checker.py index d5b4c75a..b8df8b71 100755 --- a/setup-scripts/health-checker.py +++ b/setup-scripts/health-checker.py @@ -1,8 +1,16 @@ #!/usr/bin/env python3 -import discord, requests, traceback, asyncio, os, sys, json, io +import asyncio +import io +import json +import os +import sys +import traceback from datetime import datetime, timedelta -from bot_utils import setup, send_msg, siad, sc_precision + +import discord +import requests +from bot_utils import setup, send_msg """ health-checker reads the /health-check endpoint of the portal and dispatches @@ -11,6 +19,7 @@ messages to a Discord channel. # The default check interval in hours. 
DEFAULT_CHECK_INTERVAL = 1 +DISCORD_MAX_MESSAGE_LENGTH = 1900 bot_token = setup() client = discord.Client() @@ -32,13 +41,15 @@ async def run_checks(): print("Running Skynet portal health checks") try: await check_health() - except: # catch all exceptions + except: trace = traceback.format_exc() print("[DEBUG] run_checks() failed.") - if len(trace) < 1900: + if len(trace) < DISCORD_MAX_MESSAGE_LENGTH: await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) else: - await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True) + await send_msg(client, "Failed to run the portal health checks!", + file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), + force_notify=True) # check_health checks /health-check endpoint and reports recent issues @@ -47,13 +58,15 @@ async def check_health(): try: res = requests.get("http://localhost/health-check", verify=False) - except: # catch all exceptions + except: trace = traceback.format_exc() print("[DEBUG] check_health() failed.") - if len(trace) < 1900: + if len(trace) < DISCORD_MAX_MESSAGE_LENGTH: await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) else: - await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True) + await send_msg(client, "Failed to run the checks!", + file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), + force_notify=True) return # Get the number of hours to look back in the logs or use 1 as default. @@ -62,22 +75,32 @@ async def check_health(): check_hours = int(sys.argv[3]) # Check the health records. 
- failed_checks = [] + failed_records = [] + failed_checks = 0 + failed_critical = 0 passed_checks_counter = 0 time_limit = datetime.now() - timedelta(hours=check_hours) for rec in res.json(): time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ') if time < time_limit: continue + bad = False for check in rec['checks']: if check['up'] == False: - # We append the entire record, so we can get the full context. - failed_checks.append(rec) - break + bad = True + failed_checks += 1 + if check['critical']: + failed_critical += 1 + if bad: + # We append the entire record, so we can get the full context. + failed_records.append(rec) passed_checks_counter += 1 - if len(failed_checks) > 0: - await send_msg(client, "Found {} failed checks over the last {} hours!".format(len(failed_checks), check_hours), file=discord.File(io.BytesIO(json.dumps(failed_checks, indent=2).encode()), filename="failed_checks.log"), force_notify=True) + if len(failed_records) > 0: + message = "Found {} failed checks ({} critical) over the last {} hours!".format(failed_checks, failed_critical, + check_hours) + await send_msg(client, message, file=discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()), + filename="failed_checks.log"), force_notify=True) return # Send an informational heartbeat if all checks passed. diff --git a/setup-scripts/log-checker.py b/setup-scripts/log-checker.py index 95092201..50363e88 100755 --- a/setup-scripts/log-checker.py +++ b/setup-scripts/log-checker.py @@ -20,6 +20,9 @@ Arguments: # The default check interval in hours. DEFAULT_CHECK_INTERVAL = 1 +# Discord messages have a limit on their length set at 2000 characters. We use +# a lower limit in order to leave some space for additional message text. 
+DISCORD_MAX_MESSAGE_LENGTH = 1900 bot_token = setup() client = discord.Client() @@ -92,11 +95,12 @@ async def check_docker_logs(): std_err = std_err[pos+1:] upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(container_name, time.year, time.month, time.day, time.hour, time.minute, time.second) await send_msg(client, "Error(s) found in log!", file=discord.File(io.BytesIO(std_err.encode()), filename=upload_name), force_notify=True) - # Send at most 1900 characters of logs, rounded down to the nearest new line. - # This is a limitation in the size of Discord messages - they can be at most - # 2000 characters long (and we send some extra characters before the error log). - if len(std_err) > 1900: - pos = std_err.find("\n", -1900) + # Send at most DISCORD_MAX_MESSAGE_LENGTH characters of logs, rounded + # down to the nearest new line. This is a limitation in the size of + # Discord messages - they can be at most 2000 characters long (and we + # send some extra characters before the error log). + if len(std_err) > DISCORD_MAX_MESSAGE_LENGTH: + pos = std_err.find("\n", -DISCORD_MAX_MESSAGE_LENGTH) std_err = std_err[pos+1:] await send_msg(client, "Error(s) preview:\n{}".format(std_err), force_notify=True) return