Move the maximum Discord message length to a constant.
Report the number of failed critical checks. Formatting.
This commit is contained in:
parent
a0a9137ae7
commit
1cc20903c6
|
@ -1,8 +1,16 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import discord, requests, traceback, asyncio, os, sys, json, io
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
from datetime import datetime, timedelta
|
||||
from bot_utils import setup, send_msg, siad, sc_precision
|
||||
|
||||
import discord
|
||||
import requests
|
||||
from bot_utils import setup, send_msg
|
||||
|
||||
"""
|
||||
health-checker reads the /health-check endpoint of the portal and dispatches
|
||||
|
@ -11,6 +19,7 @@ messages to a Discord channel.
|
|||
|
||||
# The default check interval in hours.
|
||||
DEFAULT_CHECK_INTERVAL = 1
|
||||
# Discord messages can be at most 2000 characters long. We use a lower
# limit in order to leave some space for additional message text.
DISCORD_MAX_MESSAGE_LENGTH = 1900
|
||||
|
||||
bot_token = setup()
|
||||
client = discord.Client()
|
||||
|
@ -32,13 +41,15 @@ async def run_checks():
|
|||
print("Running Skynet portal health checks")
|
||||
try:
|
||||
await check_health()
|
||||
except: # catch all exceptions
|
||||
except:
|
||||
trace = traceback.format_exc()
|
||||
print("[DEBUG] run_checks() failed.")
|
||||
if len(trace) < 1900:
|
||||
if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
|
||||
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
|
||||
else:
|
||||
await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True)
|
||||
await send_msg(client, "Failed to run the portal health checks!",
|
||||
file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
|
||||
force_notify=True)
|
||||
|
||||
|
||||
# check_health checks /health-check endpoint and reports recent issues
|
||||
|
@ -47,13 +58,15 @@ async def check_health():
|
|||
|
||||
try:
|
||||
res = requests.get("http://localhost/health-check", verify=False)
|
||||
except: # catch all exceptions
|
||||
except:
|
||||
trace = traceback.format_exc()
|
||||
print("[DEBUG] check_health() failed.")
|
||||
if len(trace) < 1900:
|
||||
if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
|
||||
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
|
||||
else:
|
||||
await send_msg(client, "Failed to run the checks!", file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True)
|
||||
await send_msg(client, "Failed to run the checks!",
|
||||
file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
|
||||
force_notify=True)
|
||||
return
|
||||
|
||||
# Get the number of hours to look back in the logs or use 1 as default.
|
||||
|
@ -62,22 +75,32 @@ async def check_health():
|
|||
check_hours = int(sys.argv[3])
|
||||
|
||||
# Check the health records.
|
||||
failed_checks = []
|
||||
failed_records = []
|
||||
failed_checks = 0
|
||||
failed_critical = 0
|
||||
passed_checks_counter = 0
|
||||
time_limit = datetime.now() - timedelta(hours=check_hours)
|
||||
for rec in res.json():
|
||||
time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ')
|
||||
if time < time_limit:
|
||||
continue
|
||||
bad = False
|
||||
for check in rec['checks']:
|
||||
if check['up'] == False:
|
||||
# We append the entire record, so we can get the full context.
|
||||
failed_checks.append(rec)
|
||||
break
|
||||
bad = True
|
||||
failed_checks += 1
|
||||
if check['critical']:
|
||||
failed_critical += 1
|
||||
if bad:
|
||||
# We append the entire record, so we can get the full context.
|
||||
failed_records.append(rec)
|
||||
passed_checks_counter += 1
|
||||
|
||||
if len(failed_checks) > 0:
|
||||
await send_msg(client, "Found {} failed checks over the last {} hours!".format(len(failed_checks), check_hours), file=discord.File(io.BytesIO(json.dumps(failed_checks, indent=2).encode()), filename="failed_checks.log"), force_notify=True)
|
||||
if len(failed_records) > 0:
|
||||
message = "Found {} failed checks ({} critical) over the last {} hours!".format(failed_checks, failed_critical,
|
||||
check_hours)
|
||||
await send_msg(client, message, file=discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()),
|
||||
filename="failed_checks.log"), force_notify=True)
|
||||
return
|
||||
|
||||
# Send an informational heartbeat if all checks passed.
|
||||
|
|
|
@ -20,6 +20,9 @@ Arguments:
|
|||
|
||||
# The default check interval in hours.
|
||||
DEFAULT_CHECK_INTERVAL = 1
|
||||
# Discord messages are limited to 2000 characters in length. We use
|
||||
# a lower limit in order to leave some space for additional message text.
|
||||
DISCORD_MAX_MESSAGE_LENGTH = 1900
|
||||
|
||||
bot_token = setup()
|
||||
client = discord.Client()
|
||||
|
@ -92,11 +95,12 @@ async def check_docker_logs():
|
|||
std_err = std_err[pos+1:]
|
||||
upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(container_name, time.year, time.month, time.day, time.hour, time.minute, time.second)
|
||||
await send_msg(client, "Error(s) found in log!", file=discord.File(io.BytesIO(std_err.encode()), filename=upload_name), force_notify=True)
|
||||
# Send at most 1900 characters of logs, rounded down to the nearest new line.
|
||||
# This is a limitation in the size of Discord messages - they can be at most
|
||||
# 2000 characters long (and we send some extra characters before the error log).
|
||||
if len(std_err) > 1900:
|
||||
pos = std_err.find("\n", -1900)
|
||||
# Send at most DISCORD_MAX_MESSAGE_LENGTH characters of logs, rounded
|
||||
# down to the nearest new line. This is a limitation in the size of
|
||||
# Discord messages - they can be at most 2000 characters long (and we
|
||||
# send some extra characters before the error log).
|
||||
if len(std_err) > DISCORD_MAX_MESSAGE_LENGTH:
|
||||
pos = std_err.find("\n", -DISCORD_MAX_MESSAGE_LENGTH)
|
||||
std_err = std_err[pos+1:]
|
||||
await send_msg(client, "Error(s) preview:\n{}".format(std_err), force_notify=True)
|
||||
return
|
||||
|
|
Reference in New Issue