Move the maximum Discord message length to a constant.

Report the number of failed critical checks.
Formatting.
This commit is contained in:
Ivaylo Novakov 2020-09-04 17:07:47 +02:00
parent a0a9137ae7
commit 1cc20903c6
No known key found for this signature in database
GPG Key ID: 06B9354AB08BE9C6
2 changed files with 46 additions and 19 deletions

View File

@ -1,8 +1,16 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import discord, requests, traceback, asyncio, os, sys, json, io import asyncio
import io
import json
import os
import sys
import traceback
from datetime import datetime, timedelta from datetime import datetime, timedelta
from bot_utils import setup, send_msg, siad, sc_precision
import discord
import requests
from bot_utils import setup, send_msg
""" """
health-checker reads the /health-check endpoint of the portal and dispatches health-checker reads the /health-check endpoint of the portal and dispatches
@ -11,6 +19,7 @@ messages to a Discord channel.
# The default check interval in hours. # The default check interval in hours.
DEFAULT_CHECK_INTERVAL = 1 DEFAULT_CHECK_INTERVAL = 1
DISCORD_MAX_MESSAGE_LENGTH = 1900
bot_token = setup() bot_token = setup()
client = discord.Client() client = discord.Client()
@ -32,13 +41,15 @@ async def run_checks():
print("Running Skynet portal health checks") print("Running Skynet portal health checks")
try: try:
await check_health() await check_health()
except: # catch all exceptions except:
trace = traceback.format_exc() trace = traceback.format_exc()
print("[DEBUG] run_checks() failed.") print("[DEBUG] run_checks() failed.")
if len(trace) < 1900: if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
else: else:
await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True) await send_msg(client, "Failed to run the portal health checks!",
file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
force_notify=True)
# check_health checks /health-check endpoint and reports recent issues # check_health checks /health-check endpoint and reports recent issues
@ -47,13 +58,15 @@ async def check_health():
try: try:
res = requests.get("http://localhost/health-check", verify=False) res = requests.get("http://localhost/health-check", verify=False)
except: # catch all exceptions except:
trace = traceback.format_exc() trace = traceback.format_exc()
print("[DEBUG] check_health() failed.") print("[DEBUG] check_health() failed.")
if len(trace) < 1900: if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
else: else:
await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True) await send_msg(client, "Failed to run the checks!",
file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
force_notify=True)
return return
# Get the number of hours to look back in the logs or use 1 as default. # Get the number of hours to look back in the logs or use 1 as default.
@ -62,22 +75,32 @@ async def check_health():
check_hours = int(sys.argv[3]) check_hours = int(sys.argv[3])
# Check the health records. # Check the health records.
failed_checks = [] failed_records = []
failed_checks = 0
failed_critical = 0
passed_checks_counter = 0 passed_checks_counter = 0
time_limit = datetime.now() - timedelta(hours=check_hours) time_limit = datetime.now() - timedelta(hours=check_hours)
for rec in res.json(): for rec in res.json():
time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ') time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ')
if time < time_limit: if time < time_limit:
continue continue
bad = False
for check in rec['checks']: for check in rec['checks']:
if check['up'] == False: if check['up'] == False:
# We append the entire record, so we can get the full context. bad = True
failed_checks.append(rec) failed_checks += 1
break if check['critical']:
failed_critical += 1
if bad:
# We append the entire record, so we can get the full context.
failed_records.append(rec)
passed_checks_counter += 1 passed_checks_counter += 1
if len(failed_checks) > 0: if len(failed_records) > 0:
await send_msg(client, "Found {} failed checks over the last {} hours!".format(len(failed_checks), check_hours), file=discord.File(io.BytesIO(json.dumps(failed_checks, indent=2).encode()), filename="failed_checks.log"), force_notify=True) message = "Found {} failed checks ({} critical) over the last {} hours!".format(failed_checks, failed_critical,
check_hours)
await send_msg(client, message, file=discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()),
filename="failed_checks.log"), force_notify=True)
return return
# Send an informational heartbeat if all checks passed. # Send an informational heartbeat if all checks passed.

View File

@ -20,6 +20,9 @@ Arguments:
# The default check interval in hours. # The default check interval in hours.
DEFAULT_CHECK_INTERVAL = 1 DEFAULT_CHECK_INTERVAL = 1
# Discord messages are limited to 2000 characters. We use a lower limit
# in order to leave some space for additional message text.
DISCORD_MAX_MESSAGE_LENGTH = 1900
bot_token = setup() bot_token = setup()
client = discord.Client() client = discord.Client()
@ -92,11 +95,12 @@ async def check_docker_logs():
std_err = std_err[pos+1:] std_err = std_err[pos+1:]
upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(container_name, time.year, time.month, time.day, time.hour, time.minute, time.second) upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(container_name, time.year, time.month, time.day, time.hour, time.minute, time.second)
await send_msg(client, "Error(s) found in log!", file=discord.File(io.BytesIO(std_err.encode()), filename=upload_name), force_notify=True) await send_msg(client, "Error(s) found in log!", file=discord.File(io.BytesIO(std_err.encode()), filename=upload_name), force_notify=True)
# Send at most 1900 characters of logs, rounded down to the nearest new line. # Send at most DISCORD_MAX_MESSAGE_LENGTH characters of logs, rounded
# This is a limitation in the size of Discord messages - they can be at most # down to the nearest new line. This is a limitation in the size of
# 2000 characters long (and we send some extra characters before the error log). # Discord messages - they can be at most 2000 characters long (and we
if len(std_err) > 1900: # send some extra characters before the error log).
pos = std_err.find("\n", -1900) if len(std_err) > DISCORD_MAX_MESSAGE_LENGTH:
pos = std_err.find("\n", -DISCORD_MAX_MESSAGE_LENGTH)
std_err = std_err[pos+1:] std_err = std_err[pos+1:]
await send_msg(client, "Error(s) preview:\n{}".format(std_err), force_notify=True) await send_msg(client, "Error(s) preview:\n{}".format(std_err), force_notify=True)
return return