2020-09-04 14:12:20 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2020-09-04 15:07:47 +00:00
|
|
|
import asyncio
|
|
|
|
import io
|
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import traceback
|
2020-09-04 14:12:20 +00:00
|
|
|
from datetime import datetime, timedelta
|
2020-09-04 15:07:47 +00:00
|
|
|
|
|
|
|
import discord
|
|
|
|
import requests
|
|
|
|
from bot_utils import setup, send_msg
|
2020-09-04 14:12:20 +00:00
|
|
|
|
|
|
|
"""
|
|
|
|
health-checker reads the /health-check endpoint of the portal and dispatches
|
|
|
|
messages to a Discord channel.
|
|
|
|
"""
|
|
|
|
|
2020-09-04 15:13:36 +00:00
|
|
|
# _check_hours_from_argv picks the look-back window from the command line
def _check_hours_from_argv(argv, default=1):
    """Return the number of hours to look back in the health-check records.

    The value is read from ``argv[3]``. ``default`` is returned when that
    argument is absent. It is also returned, after printing a warning to
    stderr, when the argument is not an integer or is smaller than 1: a
    non-positive window would make the time filter reject every record and
    the script would report a heartbeat without having checked anything.
    """
    if len(argv) <= 3:
        return default
    raw = argv[3]
    try:
        hours = int(raw)
    except ValueError:
        hours = None
    if hours is None or hours < 1:
        print("[WARN] Invalid look-back window {!r}, using {} hour(s) instead.".format(raw, default),
              file=sys.stderr)
        return default
    return hours


# Get the number of hours to look back in the logs or use 1 as default.
CHECK_HOURS = _check_hours_from_argv(sys.argv)
|
|
|
|
|
|
|
|
# Discord enforces a hard limit of 2000 on the length of a message. We stay
# below that limit so there is always some room left for additional message
# text around the payload.
_DISCORD_HARD_LIMIT = 2000
_RESERVED_FOR_EXTRA_TEXT = 100
DISCORD_MAX_MESSAGE_LENGTH = _DISCORD_HARD_LIMIT - _RESERVED_FOR_EXTRA_TEXT
|
2020-09-04 14:12:20 +00:00
|
|
|
|
|
|
|
# setup() is imported from bot_utils; the value it returns is handed to
# client.run() at the bottom of this file as the bot token.
# NOTE(review): setup() presumably also parses the command-line arguments
# that precede the look-back window (argv[1], argv[2]) - confirm in bot_utils.
bot_token = setup()
# The Discord client. Its on_ready event (registered below) triggers the
# health checks, and it is passed to send_msg() for every outgoing message.
client = discord.Client()
|
|
|
|
|
|
|
|
|
|
|
|
# exit_after kills the script if it hasn't exited on its own after `delay` seconds
async def exit_after(delay):
    """Terminate the process with exit status 0 once `delay` seconds have passed.

    `delay` is forwarded to asyncio.sleep(), so other tasks on the event loop
    keep running while this coroutine waits. The coroutine never returns.
    """
    await asyncio.sleep(delay)
    try:
        # os._exit() ends the process immediately, without the interpreter's
        # normal cleanup, so anything print()ed into a buffered stream (e.g.
        # stdout redirected to a log file) would be lost unless flushed here.
        sys.stdout.flush()
        sys.stderr.flush()
    finally:
        # Exit even if flushing failed; a broken output stream must not keep
        # the script alive.
        os._exit(0)
|
|
|
|
|
|
|
|
|
|
|
|
# on_ready runs the health checks once and then schedules the script's exit
@client.event
async def on_ready():
    """Run the health checks, then schedule the process to exit.

    Registered on the Discord client through `client.event`.
    """
    try:
        await run_checks()
    finally:
        # Schedule the shutdown even when run_checks() raised (for example
        # because reporting its own failure to Discord failed). Otherwise the
        # exception would leave the client connected and the script would
        # never terminate.
        # NOTE(review): the 3 second delay presumably gives in-flight messages
        # time to be delivered before the process is killed - confirm.
        asyncio.create_task(exit_after(3))
|
|
|
|
|
|
|
|
|
|
|
|
# run_checks runs the portal health checks and reports any unexpected failure
async def run_checks():
    """Run check_health() and report an unexpected failure to Discord.

    An exception escaping check_health() is formatted as a traceback. A
    traceback shorter than DISCORD_MAX_MESSAGE_LENGTH is sent inline inside a
    code block; a longer one is attached as the file "failed_checks.log".
    Exceptions raised by send_msg() itself are not handled here.
    """
    print("Running Skynet portal health checks")
    try:
        await check_health()
    except Exception:
        # Deliberately not a bare `except:` - task cancellation,
        # KeyboardInterrupt and SystemExit must keep propagating instead of
        # being reported as a failed health check.
        trace = traceback.format_exc()
        print("[DEBUG] run_checks() failed.")
        if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
            # NOTE(review): the inline variant does not force a notification
            # while the attachment variant below does - confirm this
            # difference is intended.
            await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
        else:
            await send_msg(client, "Failed to run the portal health checks!",
                           file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
                           force_notify=True)
|
2020-09-04 14:12:20 +00:00
|
|
|
|
|
|
|
|
|
|
|
# _tally_failures counts the failed checks in the records newer than `time_limit`
def _tally_failures(records, time_limit):
    """Summarise the health records that are not older than `time_limit`.

    Each record is read through its 'date' key (parsed with the format
    '%Y-%m-%dT%H:%M:%S.%fZ' into a naive datetime) and its 'checks' key, a
    sequence of checks read through their 'up' and 'critical' keys.
    `time_limit` must therefore be a naive datetime as well.

    Returns a tuple `(failed_records, failed_checks, failed_critical,
    recent_records)`: the records containing at least one failed check, the
    number of failed checks, how many of those are critical, and the number
    of records that fell inside the time window.
    """
    failed_records = []
    failed_checks = 0
    failed_critical = 0
    recent_records = 0
    for rec in records:
        rec_time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ')
        if rec_time < time_limit:
            continue
        recent_records += 1
        # Only a value equal to False counts as a failure; a null 'up' value
        # does not, exactly as before.
        down = [check for check in rec['checks'] if check['up'] == False]  # noqa: E712
        if not down:
            continue
        failed_checks += len(down)
        failed_critical += sum(1 for check in down if check['critical'])
        # We append the entire record, so we can get the full context.
        failed_records.append(rec)
    return failed_records, failed_checks, failed_critical, recent_records


# check_health checks /health-check endpoint and reports recent issues
async def check_health():
    """Fetch the portal's health records and report on the last CHECK_HOURS hours.

    - If the request itself fails, the traceback is sent to Discord (inline
      when shorter than DISCORD_MAX_MESSAGE_LENGTH, otherwise attached as
      "failed_checks.log") and the function returns.
    - If any record in the window has a failed check, a summary is sent with
      the failing records attached as JSON; the team is notified only when at
      least one failed check is critical.
    - Otherwise an informational heartbeat with the number of records in the
      window is sent.

    Errors while decoding or walking the response (invalid JSON, missing
    keys, unexpected date format) are not handled here and propagate to the
    caller.
    """
    # Imported locally so this function does not depend on a change to the
    # file-level imports.
    from datetime import timezone

    print("\nChecking portal health status...")

    # Upper bound, in seconds, for the HTTP request. requests.get() is a
    # blocking call and has no timeout by default, so without a limit an
    # unresponsive portal would stall the whole event loop indefinitely.
    request_timeout = 30

    try:
        res = requests.get("http://localhost/health-check", verify=False, timeout=request_timeout)
    except Exception:
        # Not a bare `except:` so that task cancellation, KeyboardInterrupt
        # and SystemExit keep propagating.
        trace = traceback.format_exc()
        print("[DEBUG] check_health() failed.")
        if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
            await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
        else:
            await send_msg(client, "Failed to run the checks!",
                           file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
                           force_notify=True)
        return

    # Check the health records.
    # The record timestamps carry a literal 'Z' suffix, i.e. they are UTC, and
    # are parsed into naive datetimes. The cut-off therefore has to be naive
    # UTC too; using the local time would shift the window by the host's UTC
    # offset.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    time_limit = now_utc - timedelta(hours=CHECK_HOURS)
    failed_records, failed_checks, failed_critical, passed_checks_counter = _tally_failures(
        res.json(), time_limit)

    if len(failed_records) > 0:
        message = "Found {} failed checks ({} critical) over the last {} hours!".format(
            failed_checks, failed_critical, CHECK_HOURS)
        attachment = discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()),
                                  filename="failed_checks.log")
        notify_team = failed_critical > 0
        await send_msg(client, message, file=attachment, force_notify=notify_team)
        return

    # Send an informational heartbeat if all checks passed.
    await send_msg(client, "Health checks passed: {}\n".format(passed_checks_counter))
|
|
|
|
|
|
|
|
|
|
|
|
# Start the Discord client with the token obtained from setup(). The health
# checks are triggered from on_ready(), which also schedules the process exit
# via exit_after().
client.run(bot_token)
|