Merge pull request #372 from NebulousLabs/ivo/gollum_health_checks
Add a health checker script to Gollum.
This commit is contained in:
commit
ff56990faa
|
@ -1,7 +1,7 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
"""
|
"""
|
||||||
health-checker runs simple health checks on a portal node using the siad API and
|
funds-checker runs simple checks on a portal node using the siad API and
|
||||||
dispatches messages to a Discord channel.
|
dispatches messages to a Discord channel.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -21,23 +21,22 @@ async def exit_after(delay):
|
||||||
async def on_ready():
|
async def on_ready():
|
||||||
await run_checks()
|
await run_checks()
|
||||||
asyncio.create_task(exit_after(3))
|
asyncio.create_task(exit_after(3))
|
||||||
await client.close()
|
|
||||||
|
|
||||||
|
|
||||||
async def run_checks():
|
async def run_checks():
|
||||||
print("Running Skynet portal health checks")
|
print("Running Skynet portal funds checks")
|
||||||
try:
|
try:
|
||||||
await check_health()
|
await check_funds()
|
||||||
|
|
||||||
except: # catch all exceptions
|
except: # catch all exceptions
|
||||||
trace = traceback.format_exc()
|
trace = traceback.format_exc()
|
||||||
await send_msg(client, "```\n{}\n```".format(trace), force_notify=True)
|
await send_msg(client, "```\n{}\n```".format(trace), force_notify=True)
|
||||||
|
|
||||||
|
|
||||||
# check_health checks that the wallet is unlocked, that it has at least 1
|
# check_funds checks that the wallet is unlocked, that it has at least 1
|
||||||
# allowance worth of money left, and if more than hald the allowance is spent. If
|
# allowance worth of money left, and if less than half the allowance is spent.
|
||||||
# all checks pass it sends a informational message.
|
# If all checks pass it sends an informational message.
|
||||||
async def check_health():
|
async def check_funds():
|
||||||
print("\nChecking wallet/funds health...")
|
print("\nChecking wallet/funds health...")
|
||||||
wallet_get = siad.get_wallet()
|
wallet_get = siad.get_wallet()
|
||||||
renter_get = siad.get_renter()
|
renter_get = siad.get_renter()
|
||||||
|
@ -73,7 +72,7 @@ async def check_health():
|
||||||
return
|
return
|
||||||
|
|
||||||
# Send an informational heartbeat if all checks passed.
|
# Send an informational heartbeat if all checks passed.
|
||||||
await send_msg(client, "Health checks passed:\n{} \n{}".format(balance_msg, alloc_msg))
|
await send_msg(client, "Funds checks passed:\n{} \n{}".format(balance_msg, alloc_msg))
|
||||||
|
|
||||||
|
|
||||||
client.run(bot_token)
|
client.run(bot_token)
|
||||||
|
|
|
@ -0,0 +1,111 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
import discord
|
||||||
|
import requests
|
||||||
|
from bot_utils import setup, send_msg
|
||||||
|
|
||||||
|
"""
|
||||||
|
health-checker reads the /health-check endpoint of the portal and dispatches
|
||||||
|
messages to a Discord channel.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Number of hours of health-check history to inspect. Taken from the third
# command-line argument when one is given; defaults to 1 otherwise.
CHECK_HOURS = int(sys.argv[3]) if len(sys.argv) > 3 else 1

# Discord messages have a limit on their length set at 2000 characters. We use
# a lower limit in order to leave some space for additional message text.
DISCORD_MAX_MESSAGE_LENGTH = 1900

# The value returned by setup() is later handed to client.run() as the bot
# token.
bot_token = setup()
client = discord.Client()
|
||||||
|
|
||||||
|
|
||||||
|
# exit_after forcibly terminates the process once `delay` seconds have
# elapsed, in case the script has not exited on its own by then.
async def exit_after(delay):
    await asyncio.sleep(delay)
    # os._exit ends the process immediately with status 0, without running
    # the interpreter's normal cleanup handlers.
    os._exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
# on_ready is registered as a Discord client event handler. It runs the health
# checks and then schedules a forced exit 3 seconds later so the script does
# not linger once the results have been reported.
@client.event
async def on_ready():
    global _exit_task
    await run_checks()
    # Keep a reference to the task: the event loop only holds weak references
    # to tasks, so an unreferenced task may be garbage-collected before it
    # finishes.
    _exit_task = asyncio.create_task(exit_after(3))
|
||||||
|
|
||||||
|
|
||||||
|
# run_checks runs the portal health checks. If the checker itself fails with
# an unexpected error, the traceback is reported to Discord and the team is
# notified.
async def run_checks():
    print("Running Skynet portal health checks")
    try:
        await check_health()
    except Exception:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt and SystemExit are not swallowed here.
        trace = traceback.format_exc()
        print("[DEBUG] run_checks() failed.")
        # A failure to run the checks always notifies the team; whether the
        # traceback is short or long only decides how it is delivered.
        if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
            await send_msg(client, "```\n{}\n```".format(trace), force_notify=True)
        else:
            # Too long to send inline: attach the traceback as a file.
            await send_msg(client, "Failed to run the portal health checks!",
                           file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
                           force_notify=True)
|
||||||
|
|
||||||
|
|
||||||
|
# check_health reads the portal's /health-check endpoint and reports to Discord
# the checks that failed during the last CHECK_HOURS hours. The team is
# notified when at least one failed check is marked critical. If nothing
# failed in that window, an informational heartbeat is sent instead.
async def check_health():
    # Imported locally because the module-level import only brings in
    # `datetime` and `timedelta`.
    from datetime import timezone

    print("\nChecking portal health status...")

    try:
        # The timeout keeps an unresponsive portal from hanging the checker:
        # requests waits indefinitely when no timeout is given.
        res = requests.get("http://localhost/health-check", verify=False, timeout=30)
    except Exception:
        trace = traceback.format_exc()
        print("[DEBUG] check_health() failed.")
        # Being unable to reach the endpoint always notifies the team; the
        # traceback length only decides whether it is sent inline or attached.
        if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
            await send_msg(client, "```\n{}\n```".format(trace), force_notify=True)
        else:
            await send_msg(client, "Failed to run the checks!",
                           file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
                           force_notify=True)
        return

    # Check the health records.
    failed_records = []
    failed_checks = 0
    failed_critical = 0
    passed_checks_counter = 0
    # The record dates end in "Z" (ISO 8601 notation for UTC), so the cutoff is
    # computed in UTC as well instead of in the host's local time.
    time_limit = datetime.now(timezone.utc) - timedelta(hours=CHECK_HOURS)
    for rec in res.json():
        rec_time = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
        if rec_time < time_limit:
            # Record is older than the look-back window.
            continue
        bad = False
        for check in rec['checks']:
            if check['up'] == False:
                bad = True
                failed_checks += 1
                if check['critical']:
                    failed_critical += 1
        if bad:
            # We append the entire record, so we can get the full context.
            failed_records.append(rec)
        # Counts every record inside the window; the value is only reported
        # when none of those records contained a failed check.
        passed_checks_counter += 1

    if len(failed_records) > 0:
        message = "Found {} failed checks ({} critical) over the last {} hours!".format(
            failed_checks, failed_critical, CHECK_HOURS)
        attachment = discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()),
                                  filename="failed_checks.log")
        # Only ping the team when at least one failed check is critical.
        notify_team = failed_critical > 0
        await send_msg(client, message, file=attachment, force_notify=notify_team)
        return

    # Send an informational heartbeat if all checks passed.
    await send_msg(client, "Health checks passed: {}\n".format(passed_checks_counter))
|
||||||
|
|
||||||
|
|
||||||
|
# Start the Discord client with the bot token; the health checks themselves
# are triggered from the on_ready event handler.
client.run(bot_token)
|
|
@ -18,8 +18,19 @@ Arguments:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# The default check interval in hours.
|
# Get the container name as an argument or use "sia" as default.
|
||||||
DEFAULT_CHECK_INTERVAL = 1
|
CONTAINER_NAME = "sia"
|
||||||
|
if len(sys.argv) > 2:
|
||||||
|
CONTAINER_NAME = sys.argv[2]
|
||||||
|
|
||||||
|
# Get the number of hours to look back in the logs or use 1 as default.
|
||||||
|
CHECK_HOURS = 1
|
||||||
|
if len(sys.argv) > 3:
|
||||||
|
CHECK_HOURS = int(sys.argv[3])
|
||||||
|
|
||||||
|
# Discord messages have a limit on their length set at 2000 bytes. We use
|
||||||
|
# a lower limit in order to leave some space for additional message text.
|
||||||
|
DISCORD_MAX_MESSAGE_LENGTH = 1900
|
||||||
|
|
||||||
bot_token = setup()
|
bot_token = setup()
|
||||||
client = discord.Client()
|
client = discord.Client()
|
||||||
|
@ -65,23 +76,13 @@ async def check_load_average():
|
||||||
async def check_docker_logs():
|
async def check_docker_logs():
|
||||||
print("\nChecking docker logs...")
|
print("\nChecking docker logs...")
|
||||||
|
|
||||||
# Get the container name as an argument or use "sia" as default.
|
|
||||||
container_name = "sia"
|
|
||||||
if len(sys.argv) > 2:
|
|
||||||
container_name = sys.argv[2]
|
|
||||||
|
|
||||||
# Get the number of hours to look back in the logs or use 1 as default.
|
|
||||||
check_hours = DEFAULT_CHECK_INTERVAL
|
|
||||||
if len(sys.argv) > 3:
|
|
||||||
check_hours = int(sys.argv[3])
|
|
||||||
|
|
||||||
now = datetime.now()
|
now = datetime.now()
|
||||||
time = now - timedelta(hours=check_hours)
|
time = now - timedelta(hours=CHECK_HOURS)
|
||||||
time_string = "{}h".format(check_hours)
|
time_string = "{}h".format(CHECK_HOURS)
|
||||||
|
|
||||||
# Read the logs.
|
# Read the logs.
|
||||||
print("[DEBUG] Will run `docker logs --since {} {}`".format(time_string, container_name))
|
print("[DEBUG] Will run `docker logs --since {} {}`".format(time_string, CONTAINER_NAME))
|
||||||
proc = Popen(["docker", "logs", "--since", time_string, container_name], stdin=PIPE, stdout=PIPE, stderr=PIPE, text=True)
|
proc = Popen(["docker", "logs", "--since", time_string, CONTAINER_NAME], stdin=PIPE, stdout=PIPE, stderr=PIPE, text=True)
|
||||||
std_out, std_err = proc.communicate()
|
std_out, std_err = proc.communicate()
|
||||||
|
|
||||||
if len(std_err) > 0:
|
if len(std_err) > 0:
|
||||||
|
@ -90,20 +91,21 @@ async def check_docker_logs():
|
||||||
if len(std_err) > one_mb:
|
if len(std_err) > one_mb:
|
||||||
pos = std_err.find("\n", -one_mb)
|
pos = std_err.find("\n", -one_mb)
|
||||||
std_err = std_err[pos+1:]
|
std_err = std_err[pos+1:]
|
||||||
upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(container_name, time.year, time.month, time.day, time.hour, time.minute, time.second)
|
upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(CONTAINER_NAME, time.year, time.month, time.day, time.hour, time.minute, time.second)
|
||||||
await send_msg(client, "Error(s) found in log!", file=discord.File(io.BytesIO(std_err.encode()), filename=upload_name), force_notify=True)
|
await send_msg(client, "Error(s) found in log!", file=discord.File(io.BytesIO(std_err.encode()), filename=upload_name), force_notify=True)
|
||||||
# Send at most 1900 characters of logs, rounded down to the nearest new line.
|
# Send at most DISCORD_MAX_MESSAGE_LENGTH characters of logs, rounded
|
||||||
# This is a limitation in the size of Discord messages - they can be at most
|
# down to the nearest new line. This is a limitation in the size of
|
||||||
# 2000 characters long (and we send some extra characters before the error log).
|
# Discord messages - they can be at most 2000 characters long (and we
|
||||||
if len(std_err) > 1900:
|
# send some extra characters before the error log).
|
||||||
pos = std_err.find("\n", -1900)
|
if len(std_err) > DISCORD_MAX_MESSAGE_LENGTH:
|
||||||
|
pos = std_err.find("\n", -DISCORD_MAX_MESSAGE_LENGTH)
|
||||||
std_err = std_err[pos+1:]
|
std_err = std_err[pos+1:]
|
||||||
await send_msg(client, "Error(s) preview:\n{}".format(std_err), force_notify=True)
|
await send_msg(client, "Error(s) preview:\n{}".format(std_err), force_notify=True)
|
||||||
return
|
return
|
||||||
|
|
||||||
# If there are any critical or severe errors. upload the whole log file.
|
# If there are any critical or severe errors. upload the whole log file.
|
||||||
if 'Critical' in std_out or 'Severe' in std_out or 'panic' in std_out:
|
if 'Critical' in std_out or 'Severe' in std_out or 'panic' in std_out:
|
||||||
upload_name = "{}-{}-{}-{}-{}:{}:{}.log".format(container_name, time.year, time.month, time.day, time.hour, time.minute, time.second)
|
upload_name = "{}-{}-{}-{}-{}:{}:{}.log".format(CONTAINER_NAME, time.year, time.month, time.day, time.hour, time.minute, time.second)
|
||||||
await send_msg(client, "Critical or Severe error found in log!", file=discord.File(io.BytesIO(std_out.encode()), filename=upload_name), force_notify=True)
|
await send_msg(client, "Critical or Severe error found in log!", file=discord.File(io.BytesIO(std_out.encode()), filename=upload_name), force_notify=True)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -5,11 +5,12 @@ set -e # exit on first error
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get -y install python3-pip
|
sudo apt-get -y install python3-pip
|
||||||
|
|
||||||
pip3 install discord.py
|
pip3 install discord.py python-dotenv requests
|
||||||
pip3 install python-dotenv
|
|
||||||
|
|
||||||
fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env"
|
fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env"
|
||||||
logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8"
|
logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8"
|
||||||
|
healthCheck="0 * * * * /home/user/skynet-webportal/setup-scripts/health-checker.py /home/user/skynet-webportal/.env sia 1"
|
||||||
|
|
||||||
(crontab -u user -l; echo "$fundsCheck" ) | crontab -u user -
|
(crontab -u user -l; echo "$fundsCheck" ) | crontab -u user -
|
||||||
(crontab -u user -l; echo "$logsCheck" ) | crontab -u user -
|
(crontab -u user -l; echo "$logsCheck" ) | crontab -u user -
|
||||||
|
(crontab -u user -l; echo "$healthCheck" ) | crontab -u user -
|
||||||
|
|
Reference in New Issue