reimplement health checks (#434)

This commit is contained in:
Karol Wypchło 2020-09-29 12:32:45 +02:00 committed by GitHub
parent 5e9f88bfce
commit 10a251c081
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 210 additions and 123 deletions

View File

@ -3,8 +3,9 @@
from urllib.request import urlopen, Request
from dotenv import load_dotenv
from pathlib import Path
from datetime import datetime
import urllib, json, os, traceback, discord, sys
import urllib, json, os, traceback, discord, sys, re, subprocess, requests, io
# sc_precision is the number of hastings per siacoin
sc_precision = 10 ** 24
@ -17,6 +18,22 @@ api_endpoint, port, portal_name, bot_token, password = None, None, None, None, N
discord_client = None
setup_done = False
# The docker container name comes from the second command line argument when
# one is given; otherwise it falls back to "sia".
CONTAINER_NAME = sys.argv[2] if len(sys.argv) > 2 else "sia"
# find out local siad ip by inspecting its docker container
def get_api_ip():
    """Return the IP address of the docker container named CONTAINER_NAME.

    Runs ``docker inspect`` with a format template that prints the address of
    every network the container is attached to, and returns the output when
    it is a single dotted-quad address.

    Raises:
        subprocess.CalledProcessError: if ``docker inspect`` exits non-zero.
        RuntimeError: if the output is not a single dotted-quad address
            (for example when the container has no address, or is attached
            to several networks so the addresses run together).
    """
    # CONTAINER_NAME may come straight from the command line, so the command
    # is passed as an argument list and never goes through a shell.
    docker_cmd = [
        "docker",
        "inspect",
        "-f",
        "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}",
        CONTAINER_NAME,
    ]
    output = subprocess.check_output(docker_cmd).decode("utf-8").strip()
    if re.fullmatch(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", output) is None:
        raise RuntimeError(
            "Could not determine the IP of container {!r}, "
            "docker inspect returned: {!r}".format(CONTAINER_NAME, output)
        )
    return output
def setup():
# Load dotenv file if possible.
# TODO: change all scripts to use named flags/params
@ -37,7 +54,7 @@ def setup():
port = "9980"
global api_endpoint
api_endpoint = "http://localhost:{}".format(port)
api_endpoint = "http://{}:{}".format(get_api_ip(), port)
siad.initialize()
@ -46,6 +63,7 @@ def setup():
return bot_token
# send_msg sends the msg to the specified discord channel. If force_notify is set to true it adds "@here".
async def send_msg(client, msg, force_notify=False, file=None):
await client.wait_until_ready()
@ -69,14 +87,48 @@ async def send_msg(client, msg, force_notify=False, file=None):
break
# Add the portal name.
msg = "`{}`: {}".format(portal_name, msg)
msg = "**{}**: {}".format(portal_name, msg)
if isinstance(file, str):
is_json = is_json_string(file)
content_type = "application/json" if is_json else "text/plain"
ext = "json" if is_json else "txt"
filename = "{}-{}.{}".format(
CONTAINER_NAME, datetime.utcnow().strftime("%Y-%m-%d-%H:%M:%S"), ext
)
skylink = upload_to_skynet(file, filename, content_type=content_type)
if skylink:
msg = "{} {}".format(msg, skylink) # append skylink to message
file = None # clean file reference, we're using a skylink
else:
file = discord.File(
io.BytesIO(file.encode()), filename=filename
) # wrap text into discord file wrapper
if force_notify:
msg = "{}: \n{}".format(role.mention, msg)
msg = "{} /cc {}".format(msg, role.mention)
await chan.send(msg, file=file)
#siad class provides wrappers for the necessary siad commands.
def upload_to_skynet(contents, filename="file.txt", content_type="text/plain"):
    """Upload contents to Skynet and return a link to it, or None on failure.

    Args:
        contents: body of the file to upload.
        filename: name reported for the uploaded file.
        content_type: MIME type reported for the uploaded file.

    Returns:
        "https://siasky.net/<skylink>" when the portal answers with status OK
        and a JSON body containing "skylink"; None in every other case
        (non-OK status, network error, timeout, malformed response), so the
        caller can fall back to another way of delivering the contents.
    """
    files = {"file": (filename, contents, content_type)}
    try:
        # Without a timeout an unresponsive portal would block the caller
        # indefinitely and the alert would never be sent.
        res = requests.post(
            "https://siasky.net/skynet/skyfile", files=files, timeout=30
        )
        if res.status_code != requests.codes["ok"]:
            return None
        return "https://siasky.net/" + res.json()["skylink"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        print("[DEBUG] upload_to_skynet() failed: {}".format(err))
        return None
def is_json_string(str):
    """Return True if the argument parses as JSON, False otherwise.

    Args:
        str: the value to check. The name shadows the builtin; it is kept
            unchanged so existing keyword callers keep working.

    Returns:
        True when json.loads accepts the value; False when it is not valid
        JSON or is not a type json.loads can parse (for example None).
    """
    try:
        json.loads(str)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError is raised for
        # inputs that are not str, bytes or bytearray.
        return False
    return True
# siad class provides wrappers for the necessary siad commands.
class siad:
# initializes values for using the API (password and
# user-agent) so that all calls to urllib.request.urlopen have these set.
@ -90,7 +142,7 @@ class siad:
# Setup an opener with the correct user agent
opener = urllib.request.build_opener(handler)
opener.addheaders = [('User-agent', 'Sia-Agent')]
opener.addheaders = [("User-agent", "Sia-Agent")]
# Install the opener.
# Now all calls to urllib.request.urlopen use our opener.
@ -102,7 +154,7 @@ class siad:
password = os.getenv("SIA_API_PASSWORD")
if not password:
home = os.getenv("HOME")
password_file = open(home+"/.sia/apipassword")
password_file = open(home + "/.sia/apipassword")
password = password_file.readlines()[0].strip()
return password
@ -111,26 +163,26 @@ class siad:
def load_json(resp):
return json.loads(resp.decode("utf-8"))
@staticmethod
def get_wallet():
if not setup_done: setup()
if not setup_done:
setup()
resp = urllib.request.urlopen(api_endpoint + "/wallet").read()
return siad.load_json(resp)
@staticmethod
def get_renter():
if not setup_done: setup()
if not setup_done:
setup()
resp = urllib.request.urlopen(api_endpoint + "/renter").read()
return siad.load_json(resp)
@staticmethod
def get_renter_contracts():
if not setup_done: setup()
if not setup_done:
setup()
resp = urllib.request.urlopen(api_endpoint + "/renter/contracts").read()
return siad.load_json(resp)

View File

@ -27,7 +27,6 @@ async def run_checks():
print("Running Skynet portal funds checks")
try:
await check_funds()
except: # catch all exceptions
trace = traceback.format_exc()
await send_msg(client, "```\n{}\n```".format(trace), force_notify=True)
@ -41,38 +40,43 @@ async def check_funds():
wallet_get = siad.get_wallet()
renter_get = siad.get_renter()
if not wallet_get['unlocked']:
if not wallet_get["unlocked"]:
await send_msg(client, "Wallet locked", force_notify=True)
return
confirmed_coins = int(wallet_get['confirmedsiacoinbalance'])
unconfirmed_coins = int(wallet_get['unconfirmedincomingsiacoins'])
unconfirmed_outgoing_coins = int(wallet_get['unconfirmedoutgoingsiacoins'])
confirmed_coins = int(wallet_get["confirmedsiacoinbalance"])
unconfirmed_coins = int(wallet_get["unconfirmedincomingsiacoins"])
unconfirmed_outgoing_coins = int(wallet_get["unconfirmedoutgoingsiacoins"])
balance = confirmed_coins + unconfirmed_coins - unconfirmed_outgoing_coins
print("Balance: ", balance / sc_precision)
allowance = renter_get['settings']['allowance']
allowance_funds = int(allowance['funds'])
allocated_funds = int(renter_get['financialmetrics']['totalallocated'])
allowance = renter_get["settings"]["allowance"]
allowance_funds = int(allowance["funds"])
allocated_funds = int(renter_get["financialmetrics"]["totalallocated"])
unallocated_funds = allowance_funds - allocated_funds
balance_msg = "Balance: `{} SC` Allowance Funds: `{} SC`".format(round(balance/sc_precision), round(allowance_funds/sc_precision))
alloc_msg = "Unallocated: `{} SC`\nAllocated: `{} SC`".format(round(unallocated_funds/sc_precision), round(allocated_funds/sc_precision))
balance_msg = "Balance: {} SC, Allowance Funds: {} SC".format(
round(balance / sc_precision), round(allowance_funds / sc_precision)
)
alloc_msg = "Unallocated: {} SC, Allocated: {} SC".format(
round(unallocated_funds / sc_precision), round(allocated_funds / sc_precision)
)
# Send an alert if there is less than 1 allowance worth of money left.
if balance < allowance_funds:
await send_msg(client, "Wallet balance running low. \n{}".format(balance_msg), force_notify=True)
return
message = "__Wallet balance running low!__ {}".format(balance_msg)
return await send_msg(client, message, force_notify=True)
# Alert devs when only a fraction of the allowance is remaining.
SPEND_THRESHOLD = 0.8
if allocated_funds >= SPEND_THRESHOLD * allowance_funds :
await send_msg(client, "More than {:.0%} of allowance spent: \n{}".format(SPEND_THRESHOLD, alloc_msg), force_notify=True)
return
if allocated_funds >= SPEND_THRESHOLD * allowance_funds:
message = "__More than {:.0%} of allowance spent!__ {}".format(
SPEND_THRESHOLD, alloc_msg
)
return await send_msg(client, message, force_notify=True)
# Send an informational heartbeat if all checks passed.
await send_msg(client, "Funds checks passed:\n{} \n{}".format(balance_msg, alloc_msg))
await send_msg(client, "Funds checks passed. {} {}".format(balance_msg, alloc_msg))
client.run(bot_token)

View File

@ -1,19 +1,8 @@
#!/usr/bin/env python3
import asyncio
import io
import json
import os
import re
import sys
import traceback
import asyncio, json, os, re, sys, traceback, discord, requests
from datetime import datetime, timedelta
import discord
import pytz.reference
import requests
from bot_utils import setup, send_msg
from tzlocal import get_localzone
"""
health-checker reads the /health-check endpoint of the portal and dispatches
@ -55,19 +44,19 @@ async def run_checks():
try:
await check_load_average()
await check_disk()
# await check_health() # FIXME: adjust it to work with https://github.com/NebulousLabs/skynet-webportal/pull/389
await check_health()
except:
trace = traceback.format_exc()
print("[DEBUG] run_checks() failed.")
if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
else:
await send_msg(client, "Failed to run the portal health checks!",
file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
force_notify=True)
await send_msg(
client,
"Failed to run the portal health checks!",
file=trace,
force_notify=True,
)
# check_load_average monitors the system's load average value and issues a
# check_load_average monitors the system load average value and issues a
# warning message if it exceeds 10.
async def check_load_average():
uptime_string = os.popen("uptime").read().strip()
@ -77,7 +66,8 @@ async def check_load_average():
pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
load_av = re.match(pattern, uptime_string).group(1)
if float(load_av) > 10:
await send_msg(client, "High system load detected: `uptime: {}`".format(uptime_string), force_notify=True)
message = "High system load detected in uptime output: {}".format(uptime_string)
await send_msg(client, message, force_notify=True)
# check_disk checks the amount of free space on the /home partition and issues
@ -100,13 +90,12 @@ async def check_disk():
vol = mp
break
if vol == "":
msg = "Failed to check free disk space! Didn't find a suitable mount point to check.\ndf output:\n{}".format(df)
await send_msg(client, msg)
return
message = "Failed to check free disk space! Didn't find a suitable mount point to check."
return await send_msg(client, message, file=df)
if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD:
free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
await send_msg(client, "WARNING! Low disk space: {}GiB".format(free_space_gb), force_notify=True)
return
message = "WARNING! Low disk space: {}GiB".format(free_space_gb)
return await send_msg(client, message, force_notify=True)
# check_health checks /health-check endpoint and reports recent issues
@ -114,55 +103,94 @@ async def check_health():
print("\nChecking portal health status...")
try:
res = requests.get("http://localhost/health-check", verify=False)
res_check = requests.get("http://localhost/health-check", verify=False)
json_check = res_check.json()
json_critical = requests.get(
"http://localhost/health-check/critical", verify=False
).json()
json_verbose = requests.get(
"http://localhost/health-check/verbose", verify=False
).json()
except:
trace = traceback.format_exc()
print("[DEBUG] check_health() failed.")
if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
else:
await send_msg(client, "Failed to run the checks!",
file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
force_notify=True)
return
return await send_msg(
client, "Failed to run the checks!", file=trace, force_notify=True
)
critical_checks_total = 0
critical_checks_failed = 0
verbose_checks_total = 0
verbose_checks_failed = 0
# Check the health records.
passed_checks = 0
failed_checks = 0
failed_critical = 0
failed_records = []
time_limit_unaware = datetime.now() - timedelta(hours=CHECK_HOURS) # local time
time_limit = time_limit_unaware.astimezone(get_localzone()) # time with time zone
for rec in res.json():
time_unaware = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ') # time in UTC
time = pytz.utc.localize(time_unaware) # time with time zone
failed_records_file = None
time_limit = datetime.utcnow() - timedelta(hours=CHECK_HOURS)
for critical in json_critical:
time = datetime.strptime(critical["date"], "%Y-%m-%dT%H:%M:%S.%fZ")
if time < time_limit:
continue
bad = False
for check in rec['checks']:
if check['up'] == False:
for check in critical["checks"]:
critical_checks_total += 1
if check["up"] == False:
critical_checks_failed += 1
bad = True
failed_checks += 1
if check['critical']:
failed_critical += 1
if bad:
# We append the entire record, so we can get the full context.
failed_records.append(rec)
passed_checks += 1
failed_records.append(critical)
checks = passed_checks + failed_checks
if len(failed_records) > 0:
message = "Found {}/{} failed checks ({} critical) over the last {} hours!".format(failed_checks, checks,
failed_critical, CHECK_HOURS)
file = discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()), filename="failed_checks.log")
notifyTeam = failed_critical > 0
await send_msg(client, message, file=file, force_notify=notifyTeam)
return
for verbose in json_verbose:
time = datetime.strptime(verbose["date"], "%Y-%m-%dT%H:%M:%S.%fZ")
if time < time_limit:
continue
bad = False
for check in verbose["checks"]:
verbose_checks_total += 1
if check["up"] == False:
verbose_checks_failed += 1
bad = True
if bad:
failed_records.append(verbose)
# Send an informational heartbeat if all checks passed but only if it's in
# the first CHECK_HOURS hours of the day, essentially the first call.
if datetime.now().hour < CHECK_HOURS:
await send_msg(client, "Health checks passed: {}/{}\n".format(passed_checks, checks))
################################################################################
################ create a message
################################################################################
message = ""
force_notify = False
if json_check["disabled"]:
message += "__Portal manually disabled!__ "
force_notify = True
elif res_check.status_code is not requests.codes["ok"]:
message += "__Portal down!!!__ "
force_notify = True
if critical_checks_failed:
message += "{}/{} CRITICAL checks failed over the last {} hours! ".format(
critical_checks_failed, critical_checks_total, CHECK_HOURS
)
else:
message += "All {} critical checks passed. ".format(critical_checks_total)
if verbose_checks_failed:
message += "{}/{} verbose checks failed over the last {} hours! ".format(
verbose_checks_failed, verbose_checks_total, CHECK_HOURS
)
else:
message += "All {} verbose checks passed. ".format(verbose_checks_total)
if len(failed_records):
failed_records_file = json.dumps(failed_records, indent=2)
# Send a message if notification is forced, if there is a dump of failed records, or once daily as a heartbeat (during the 1 AM UTC hour).
if force_notify or failed_records_file or datetime.utcnow().hour == 1:
return await send_msg(
client, message, file=failed_records_file, force_notify=force_notify
)
client.run(bot_token)

View File

@ -1,8 +1,7 @@
#!/usr/bin/env python3
import discord, sys, traceback, io, os, asyncio
from bot_utils import setup, send_msg
from datetime import datetime, timedelta
from bot_utils import setup, send_msg, upload_to_skynet
from subprocess import Popen, PIPE
"""
@ -61,43 +60,47 @@ async def run_checks():
async def check_docker_logs():
print("\nChecking docker logs...")
now = datetime.now()
time = now - timedelta(hours=CHECK_HOURS)
time_string = "{}h".format(CHECK_HOURS)
since_string = "{}h".format(CHECK_HOURS)
# Read the logs.
print("[DEBUG] Will run `docker logs --since {} {}`".format(time_string, CONTAINER_NAME))
proc = Popen(["docker", "logs", "--since", time_string, CONTAINER_NAME], stdin=PIPE, stdout=PIPE, stderr=PIPE, text=True)
print(
"[DEBUG] Will run `docker logs --since {} {}`".format(
since_string, CONTAINER_NAME
)
)
proc = Popen(
["docker", "logs", "--since", since_string, CONTAINER_NAME],
stdin=PIPE,
stdout=PIPE,
stderr=PIPE,
text=True,
)
std_out, std_err = proc.communicate()
if len(std_err) > 0:
# Trim the error log to under 1MB.
one_mb = 1024*1024
one_mb = 1024 * 1024
if len(std_err) > one_mb:
pos = std_err.find("\n", -one_mb)
std_err = std_err[pos+1:]
upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(CONTAINER_NAME, time.year, time.month, time.day, time.hour, time.minute, time.second)
await send_msg(client, "Error(s) found in log!", file=discord.File(io.BytesIO(std_err.encode()), filename=upload_name), force_notify=True)
# Send at most DISCORD_MAX_MESSAGE_LENGTH characters of logs, rounded
# down to the nearest new line. This is a limitation in the size of
# Discord messages - they can be at most 2000 characters long (and we
# send some extra characters before the error log).
if len(std_err) > DISCORD_MAX_MESSAGE_LENGTH:
pos = std_err.find("\n", -DISCORD_MAX_MESSAGE_LENGTH)
std_err = std_err[pos+1:]
await send_msg(client, "Error(s) preview:\n{}".format(std_err), force_notify=True)
return
std_err = std_err[pos + 1 :]
return await send_msg(
client, "Error(s) found in log!", file=std_err, force_notify=True
)
# If there are any critical or severe errors, upload the whole log file.
if 'Critical' in std_out or 'Severe' in std_out or 'panic' in std_out:
upload_name = "{}-{}-{}-{}-{}:{}:{}.log".format(CONTAINER_NAME, time.year, time.month, time.day, time.hour, time.minute, time.second)
await send_msg(client, "Critical or Severe error found in log!", file=discord.File(io.BytesIO(std_out.encode()), filename=upload_name), force_notify=True)
return
if "Critical" in std_out or "Severe" in std_out or "panic" in std_out:
return await send_msg(
client,
"Critical or Severe error found in log!",
file=std_out,
force_notify=True,
)
# No critical or severe errors, return a heartbeat type message
pretty_before = time.strftime("%I:%M%p")
pretty_now = now.strftime("%I:%M%p")
await send_msg(client, "No critical or severe warnings in log from `{}` to `{}`".format(pretty_before, pretty_now))
return await send_msg(
client,
"No critical or severe warnings in log since `{}` hours".format(CHECK_HOURS),
)
client.run(bot_token)

View File

@ -5,7 +5,7 @@ set -e # exit on first error
sudo apt-get update
sudo apt-get -y install python3-pip
pip3 install discord.py python-dotenv requests pytz tzlocal
pip3 install discord.py python-dotenv requests
fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env"
logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8"