reimplement health checks (#434)

This commit is contained in:
Karol Wypchło 2020-09-29 12:32:45 +02:00 committed by GitHub
parent 5e9f88bfce
commit 10a251c081
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 210 additions and 123 deletions

View File

@ -3,8 +3,9 @@
from urllib.request import urlopen, Request from urllib.request import urlopen, Request
from dotenv import load_dotenv from dotenv import load_dotenv
from pathlib import Path from pathlib import Path
from datetime import datetime
import urllib, json, os, traceback, discord, sys import urllib, json, os, traceback, discord, sys, re, subprocess, requests, io
# sc_precision is the number of hastings per siacoin # sc_precision is the number of hastings per siacoin
sc_precision = 10 ** 24 sc_precision = 10 ** 24
@ -17,6 +18,22 @@ api_endpoint, port, portal_name, bot_token, password = None, None, None, None, N
discord_client = None discord_client = None
setup_done = False setup_done = False
# Name of the docker container to inspect. It is taken from the second
# command-line argument when one is given; otherwise it defaults to "sia".
CONTAINER_NAME = sys.argv[2] if len(sys.argv) > 2 else "sia"
# find out local siad ip by inspecting its docker container
def get_api_ip():
    """Return the IP address of the docker container named CONTAINER_NAME.

    Runs ``docker inspect`` with a format template that prints the
    container's network IP address and validates the output.

    Returns:
        The IPv4 address printed by ``docker inspect`` as a string.

    Raises:
        subprocess.CalledProcessError: if ``docker inspect`` exits non-zero.
        IndexError: if the output is not a single dotted-quad IPv4 address
            (same exception type the previous ``findall(...)[0]`` raised).
    """
    ip_regex = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
    # Pass an argument list and do not go through a shell: CONTAINER_NAME
    # may come from the command line and must not be interpreted as shell
    # syntax.
    docker_cmd = [
        "docker",
        "inspect",
        "-f",
        "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}",
        CONTAINER_NAME,
    ]
    output = subprocess.check_output(docker_cmd).decode("utf-8").strip()
    if not ip_regex.match(output):
        raise IndexError(
            "no IPv4 address found in docker inspect output for container "
            "{!r}: {!r}".format(CONTAINER_NAME, output)
        )
    return output
def setup(): def setup():
# Load dotenv file if possible. # Load dotenv file if possible.
# TODO: change all scripts to use named flags/params # TODO: change all scripts to use named flags/params
@ -37,7 +54,7 @@ def setup():
port = "9980" port = "9980"
global api_endpoint global api_endpoint
api_endpoint = "http://localhost:{}".format(port) api_endpoint = "http://{}:{}".format(get_api_ip(), port)
siad.initialize() siad.initialize()
@ -46,6 +63,7 @@ def setup():
return bot_token return bot_token
# send_msg sends the msg to the specified discord channel. If force_notify is set to true it adds "@here". # send_msg sends the msg to the specified discord channel. If force_notify is set to true it adds "@here".
async def send_msg(client, msg, force_notify=False, file=None): async def send_msg(client, msg, force_notify=False, file=None):
await client.wait_until_ready() await client.wait_until_ready()
@ -69,14 +87,48 @@ async def send_msg(client, msg, force_notify=False, file=None):
break break
# Add the portal name. # Add the portal name.
msg = "`{}`: {}".format(portal_name, msg) msg = "**{}**: {}".format(portal_name, msg)
if isinstance(file, str):
is_json = is_json_string(file)
content_type = "application/json" if is_json else "text/plain"
ext = "json" if is_json else "txt"
filename = "{}-{}.{}".format(
CONTAINER_NAME, datetime.utcnow().strftime("%Y-%m-%d-%H:%M:%S"), ext
)
skylink = upload_to_skynet(file, filename, content_type=content_type)
if skylink:
msg = "{} {}".format(msg, skylink) # append skylink to message
file = None # clean file reference, we're using a skylink
else:
file = discord.File(
io.BytesIO(file.encode()), filename=filename
) # wrap text into discord file wrapper
if force_notify: if force_notify:
msg = "{}: \n{}".format(role.mention, msg) msg = "{} /cc {}".format(msg, role.mention)
await chan.send(msg, file=file) await chan.send(msg, file=file)
def upload_to_skynet(contents, filename="file.txt", content_type="text/plain"):
    """Upload ``contents`` to the siasky.net portal and return a link to it.

    Args:
        contents: the file body to upload.
        filename: the file name sent in the multipart form.
        content_type: the MIME type sent in the multipart form.

    Returns:
        The "https://siasky.net/<skylink>" URL on success, or None when the
        upload fails for any reason (non-OK status, network error, timeout,
        or a response without a usable "skylink" field).
    """
    files = {"file": (filename, contents, content_type)}
    try:
        # A timeout keeps an unresponsive portal from blocking the caller
        # indefinitely.
        res = requests.post(
            "https://siasky.net/skynet/skyfile", files=files, timeout=30
        )
        if res.status_code == requests.codes["ok"]:
            return "https://siasky.net/" + res.json()["skylink"]
    except (requests.RequestException, ValueError, KeyError):
        # The upload is best-effort: report the problem and return None so
        # the caller can fall back to another way of delivering the file.
        traceback.print_exc()
    return None
def is_json_string(str):
    """Return True if ``str`` parses as a JSON document, False otherwise.

    Args:
        str: the value to test. The name shadows the builtin but is kept
            unchanged so existing keyword callers keep working.

    Returns:
        True when ``json.loads`` accepts the value; False when it rejects it,
        including inputs of an unsupported type such as None.
    """
    try:
        json.loads(str)
    except (ValueError, TypeError):
        # ValueError: not valid JSON. TypeError: not str/bytes/bytearray.
        return False
    return True
# siad class provides wrappers for the necessary siad commands.
class siad: class siad:
# initializes values for using the API (password and # initializes values for using the API (password and
# user-agent) so that all calls to urllib.request.urlopen have these set. # user-agent) so that all calls to urllib.request.urlopen have these set.
@ -90,7 +142,7 @@ class siad:
# Setup an opener with the correct user agent # Setup an opener with the correct user agent
opener = urllib.request.build_opener(handler) opener = urllib.request.build_opener(handler)
opener.addheaders = [('User-agent', 'Sia-Agent')] opener.addheaders = [("User-agent", "Sia-Agent")]
# Install the opener. # Install the opener.
# Now all calls to urllib.request.urlopen use our opener. # Now all calls to urllib.request.urlopen use our opener.
@ -102,7 +154,7 @@ class siad:
password = os.getenv("SIA_API_PASSWORD") password = os.getenv("SIA_API_PASSWORD")
if not password: if not password:
home = os.getenv("HOME") home = os.getenv("HOME")
password_file = open(home+"/.sia/apipassword") password_file = open(home + "/.sia/apipassword")
password = password_file.readlines()[0].strip() password = password_file.readlines()[0].strip()
return password return password
@ -111,26 +163,26 @@ class siad:
def load_json(resp): def load_json(resp):
return json.loads(resp.decode("utf-8")) return json.loads(resp.decode("utf-8"))
@staticmethod @staticmethod
def get_wallet(): def get_wallet():
if not setup_done: setup() if not setup_done:
setup()
resp = urllib.request.urlopen(api_endpoint + "/wallet").read() resp = urllib.request.urlopen(api_endpoint + "/wallet").read()
return siad.load_json(resp) return siad.load_json(resp)
@staticmethod @staticmethod
def get_renter(): def get_renter():
if not setup_done: setup() if not setup_done:
setup()
resp = urllib.request.urlopen(api_endpoint + "/renter").read() resp = urllib.request.urlopen(api_endpoint + "/renter").read()
return siad.load_json(resp) return siad.load_json(resp)
@staticmethod @staticmethod
def get_renter_contracts(): def get_renter_contracts():
if not setup_done: setup() if not setup_done:
setup()
resp = urllib.request.urlopen(api_endpoint + "/renter/contracts").read() resp = urllib.request.urlopen(api_endpoint + "/renter/contracts").read()
return siad.load_json(resp) return siad.load_json(resp)

View File

@ -27,7 +27,6 @@ async def run_checks():
print("Running Skynet portal funds checks") print("Running Skynet portal funds checks")
try: try:
await check_funds() await check_funds()
except: # catch all exceptions except: # catch all exceptions
trace = traceback.format_exc() trace = traceback.format_exc()
await send_msg(client, "```\n{}\n```".format(trace), force_notify=True) await send_msg(client, "```\n{}\n```".format(trace), force_notify=True)
@ -41,38 +40,43 @@ async def check_funds():
wallet_get = siad.get_wallet() wallet_get = siad.get_wallet()
renter_get = siad.get_renter() renter_get = siad.get_renter()
if not wallet_get['unlocked']: if not wallet_get["unlocked"]:
await send_msg(client, "Wallet locked", force_notify=True) await send_msg(client, "Wallet locked", force_notify=True)
return return
confirmed_coins = int(wallet_get['confirmedsiacoinbalance']) confirmed_coins = int(wallet_get["confirmedsiacoinbalance"])
unconfirmed_coins = int(wallet_get['unconfirmedincomingsiacoins']) unconfirmed_coins = int(wallet_get["unconfirmedincomingsiacoins"])
unconfirmed_outgoing_coins = int(wallet_get['unconfirmedoutgoingsiacoins']) unconfirmed_outgoing_coins = int(wallet_get["unconfirmedoutgoingsiacoins"])
balance = confirmed_coins + unconfirmed_coins - unconfirmed_outgoing_coins balance = confirmed_coins + unconfirmed_coins - unconfirmed_outgoing_coins
print("Balance: ", balance / sc_precision) print("Balance: ", balance / sc_precision)
allowance = renter_get['settings']['allowance'] allowance = renter_get["settings"]["allowance"]
allowance_funds = int(allowance['funds']) allowance_funds = int(allowance["funds"])
allocated_funds = int(renter_get['financialmetrics']['totalallocated']) allocated_funds = int(renter_get["financialmetrics"]["totalallocated"])
unallocated_funds = allowance_funds - allocated_funds unallocated_funds = allowance_funds - allocated_funds
balance_msg = "Balance: {} SC, Allowance Funds: {} SC".format(
balance_msg = "Balance: `{} SC` Allowance Funds: `{} SC`".format(round(balance/sc_precision), round(allowance_funds/sc_precision)) round(balance / sc_precision), round(allowance_funds / sc_precision)
alloc_msg = "Unallocated: `{} SC`\nAllocated: `{} SC`".format(round(unallocated_funds/sc_precision), round(allocated_funds/sc_precision)) )
alloc_msg = "Unallocated: {} SC, Allocated: {} SC".format(
round(unallocated_funds / sc_precision), round(allocated_funds / sc_precision)
)
# Send an alert if there is less than 1 allowance worth of money left. # Send an alert if there is less than 1 allowance worth of money left.
if balance < allowance_funds: if balance < allowance_funds:
await send_msg(client, "Wallet balance running low. \n{}".format(balance_msg), force_notify=True) message = "__Wallet balance running low!__ {}".format(balance_msg)
return return await send_msg(client, message, force_notify=True)
# Alert devs when only a fraction of the allowance is remaining. # Alert devs when only a fraction of the allowance is remaining.
SPEND_THRESHOLD = 0.8 SPEND_THRESHOLD = 0.8
if allocated_funds >= SPEND_THRESHOLD * allowance_funds : if allocated_funds >= SPEND_THRESHOLD * allowance_funds:
await send_msg(client, "More than {:.0%} of allowance spent: \n{}".format(SPEND_THRESHOLD, alloc_msg), force_notify=True) message = "__More than {:.0%} of allowance spent!__ {}".format(
return SPEND_THRESHOLD, alloc_msg
)
return await send_msg(client, message, force_notify=True)
# Send an informational heartbeat if all checks passed. # Send an informational heartbeat if all checks passed.
await send_msg(client, "Funds checks passed:\n{} \n{}".format(balance_msg, alloc_msg)) await send_msg(client, "Funds checks passed. {} {}".format(balance_msg, alloc_msg))
client.run(bot_token) client.run(bot_token)

View File

@ -1,19 +1,8 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import asyncio import asyncio, json, os, re, sys, traceback, discord, requests
import io
import json
import os
import re
import sys
import traceback
from datetime import datetime, timedelta from datetime import datetime, timedelta
import discord
import pytz.reference
import requests
from bot_utils import setup, send_msg from bot_utils import setup, send_msg
from tzlocal import get_localzone
""" """
health-checker reads the /health-check endpoint of the portal and dispatches health-checker reads the /health-check endpoint of the portal and dispatches
@ -55,19 +44,19 @@ async def run_checks():
try: try:
await check_load_average() await check_load_average()
await check_disk() await check_disk()
# await check_health() # FIXME: adjust it to work with https://github.com/NebulousLabs/skynet-webportal/pull/389 await check_health()
except: except:
trace = traceback.format_exc() trace = traceback.format_exc()
print("[DEBUG] run_checks() failed.") print("[DEBUG] run_checks() failed.")
if len(trace) < DISCORD_MAX_MESSAGE_LENGTH: await send_msg(
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) client,
else: "Failed to run the portal health checks!",
await send_msg(client, "Failed to run the portal health checks!", file=trace,
file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), force_notify=True,
force_notify=True) )
# check_load_average monitors the system's load average value and issues a # check_load_average monitors the system load average value and issues a
# warning message if it exceeds 10. # warning message if it exceeds 10.
async def check_load_average(): async def check_load_average():
uptime_string = os.popen("uptime").read().strip() uptime_string = os.popen("uptime").read().strip()
@ -77,7 +66,8 @@ async def check_load_average():
pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$" pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
load_av = re.match(pattern, uptime_string).group(1) load_av = re.match(pattern, uptime_string).group(1)
if float(load_av) > 10: if float(load_av) > 10:
await send_msg(client, "High system load detected: `uptime: {}`".format(uptime_string), force_notify=True) message = "High system load detected in uptime output: {}".format(uptime_string)
await send_msg(client, message, force_notify=True)
# check_disk checks the amount of free space on the /home partition and issues # check_disk checks the amount of free space on the /home partition and issues
@ -100,13 +90,12 @@ async def check_disk():
vol = mp vol = mp
break break
if vol == "": if vol == "":
msg = "Failed to check free disk space! Didn't find a suitable mount point to check.\ndf output:\n{}".format(df) message = "Failed to check free disk space! Didn't find a suitable mount point to check."
await send_msg(client, msg) return await send_msg(client, message, file=df)
return
if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD: if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD:
free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB) free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
await send_msg(client, "WARNING! Low disk space: {}GiB".format(free_space_gb), force_notify=True) message = "WARNING! Low disk space: {}GiB".format(free_space_gb)
return return await send_msg(client, message, force_notify=True)
# check_health checks /health-check endpoint and reports recent issues # check_health checks /health-check endpoint and reports recent issues
@ -114,55 +103,94 @@ async def check_health():
print("\nChecking portal health status...") print("\nChecking portal health status...")
try: try:
res = requests.get("http://localhost/health-check", verify=False) res_check = requests.get("http://localhost/health-check", verify=False)
json_check = res_check.json()
json_critical = requests.get(
"http://localhost/health-check/critical", verify=False
).json()
json_verbose = requests.get(
"http://localhost/health-check/verbose", verify=False
).json()
except: except:
trace = traceback.format_exc() trace = traceback.format_exc()
print("[DEBUG] check_health() failed.") print("[DEBUG] check_health() failed.")
if len(trace) < DISCORD_MAX_MESSAGE_LENGTH: return await send_msg(
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False) client, "Failed to run the checks!", file=trace, force_notify=True
else: )
await send_msg(client, "Failed to run the checks!",
file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"), critical_checks_total = 0
force_notify=True) critical_checks_failed = 0
return
verbose_checks_total = 0
verbose_checks_failed = 0
# Check the health records.
passed_checks = 0
failed_checks = 0
failed_critical = 0
failed_records = [] failed_records = []
time_limit_unaware = datetime.now() - timedelta(hours=CHECK_HOURS) # local time failed_records_file = None
time_limit = time_limit_unaware.astimezone(get_localzone()) # time with time zone
for rec in res.json(): time_limit = datetime.utcnow() - timedelta(hours=CHECK_HOURS)
time_unaware = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ') # time in UTC
time = pytz.utc.localize(time_unaware) # time with time zone for critical in json_critical:
time = datetime.strptime(critical["date"], "%Y-%m-%dT%H:%M:%S.%fZ")
if time < time_limit: if time < time_limit:
continue continue
bad = False bad = False
for check in rec['checks']: for check in critical["checks"]:
if check['up'] == False: critical_checks_total += 1
if check["up"] == False:
critical_checks_failed += 1
bad = True bad = True
failed_checks += 1
if check['critical']:
failed_critical += 1
if bad: if bad:
# We append the entire record, so we can get the full context. failed_records.append(critical)
failed_records.append(rec)
passed_checks += 1
checks = passed_checks + failed_checks for verbose in json_verbose:
if len(failed_records) > 0: time = datetime.strptime(verbose["date"], "%Y-%m-%dT%H:%M:%S.%fZ")
message = "Found {}/{} failed checks ({} critical) over the last {} hours!".format(failed_checks, checks, if time < time_limit:
failed_critical, CHECK_HOURS) continue
file = discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()), filename="failed_checks.log") bad = False
notifyTeam = failed_critical > 0 for check in verbose["checks"]:
await send_msg(client, message, file=file, force_notify=notifyTeam) verbose_checks_total += 1
return if check["up"] == False:
verbose_checks_failed += 1
bad = True
if bad:
failed_records.append(verbose)
# Send an informational heartbeat if all checks passed but only if it's in ################################################################################
# the first CHECK_HOURS hours of the day, essentially the first call. ################ create a message
if datetime.now().hour < CHECK_HOURS: ################################################################################
await send_msg(client, "Health checks passed: {}/{}\n".format(passed_checks, checks))
message = ""
force_notify = False
if json_check["disabled"]:
message += "__Portal manually disabled!__ "
force_notify = True
elif res_check.status_code is not requests.codes["ok"]:
message += "__Portal down!!!__ "
force_notify = True
if critical_checks_failed:
message += "{}/{} CRITICAL checks failed over the last {} hours! ".format(
critical_checks_failed, critical_checks_total, CHECK_HOURS
)
else:
message += "All {} critical checks passed. ".format(critical_checks_total)
if verbose_checks_failed:
message += "{}/{} verbose checks failed over the last {} hours! ".format(
verbose_checks_failed, verbose_checks_total, CHECK_HOURS
)
else:
message += "All {} verbose checks passed. ".format(verbose_checks_total)
if len(failed_records):
failed_records_file = json.dumps(failed_records, indent=2)
# Send a message if notification is forced, if there is a failures dump, or once daily as a heartbeat (during the 1 AM UTC hour).
if force_notify or failed_records_file or datetime.utcnow().hour == 1:
return await send_msg(
client, message, file=failed_records_file, force_notify=force_notify
)
client.run(bot_token) client.run(bot_token)

View File

@ -1,8 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import discord, sys, traceback, io, os, asyncio import discord, sys, traceback, io, os, asyncio
from bot_utils import setup, send_msg from bot_utils import setup, send_msg, upload_to_skynet
from datetime import datetime, timedelta
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
""" """
@ -61,43 +60,47 @@ async def run_checks():
async def check_docker_logs(): async def check_docker_logs():
print("\nChecking docker logs...") print("\nChecking docker logs...")
now = datetime.now() since_string = "{}h".format(CHECK_HOURS)
time = now - timedelta(hours=CHECK_HOURS)
time_string = "{}h".format(CHECK_HOURS)
# Read the logs. # Read the logs.
print("[DEBUG] Will run `docker logs --since {} {}`".format(time_string, CONTAINER_NAME)) print(
proc = Popen(["docker", "logs", "--since", time_string, CONTAINER_NAME], stdin=PIPE, stdout=PIPE, stderr=PIPE, text=True) "[DEBUG] Will run `docker logs --since {} {}`".format(
since_string, CONTAINER_NAME
)
)
proc = Popen(
["docker", "logs", "--since", since_string, CONTAINER_NAME],
stdin=PIPE,
stdout=PIPE,
stderr=PIPE,
text=True,
)
std_out, std_err = proc.communicate() std_out, std_err = proc.communicate()
if len(std_err) > 0: if len(std_err) > 0:
# Trim the error log to under 1MB. # Trim the error log to under 1MB.
one_mb = 1024*1024 one_mb = 1024 * 1024
if len(std_err) > one_mb: if len(std_err) > one_mb:
pos = std_err.find("\n", -one_mb) pos = std_err.find("\n", -one_mb)
std_err = std_err[pos+1:] std_err = std_err[pos + 1 :]
upload_name = "{}-{}-{}-{}-{}:{}:{}_err.log".format(CONTAINER_NAME, time.year, time.month, time.day, time.hour, time.minute, time.second) return await send_msg(
await send_msg(client, "Error(s) found in log!", file=discord.File(io.BytesIO(std_err.encode()), filename=upload_name), force_notify=True) client, "Error(s) found in log!", file=std_err, force_notify=True
# Send at most DISCORD_MAX_MESSAGE_LENGTH characters of logs, rounded )
# down to the nearest new line. This is a limitation in the size of
# Discord messages - they can be at most 2000 characters long (and we
# send some extra characters before the error log).
if len(std_err) > DISCORD_MAX_MESSAGE_LENGTH:
pos = std_err.find("\n", -DISCORD_MAX_MESSAGE_LENGTH)
std_err = std_err[pos+1:]
await send_msg(client, "Error(s) preview:\n{}".format(std_err), force_notify=True)
return
# If there are any critical or severe errors. upload the whole log file. # If there are any critical or severe errors. upload the whole log file.
if 'Critical' in std_out or 'Severe' in std_out or 'panic' in std_out: if "Critical" in std_out or "Severe" in std_out or "panic" in std_out:
upload_name = "{}-{}-{}-{}-{}:{}:{}.log".format(CONTAINER_NAME, time.year, time.month, time.day, time.hour, time.minute, time.second) return await send_msg(
await send_msg(client, "Critical or Severe error found in log!", file=discord.File(io.BytesIO(std_out.encode()), filename=upload_name), force_notify=True) client,
return "Critical or Severe error found in log!",
file=std_out,
force_notify=True,
)
# No critical or severe errors, return a heartbeat type message # No critical or severe errors, return a heartbeat type message
pretty_before = time.strftime("%I:%M%p") return await send_msg(
pretty_now = now.strftime("%I:%M%p") client,
await send_msg(client, "No critical or severe warnings in log from `{}` to `{}`".format(pretty_before, pretty_now)) "No critical or severe warnings in log since `{}` hours".format(CHECK_HOURS),
)
client.run(bot_token) client.run(bot_token)

View File

@ -5,7 +5,7 @@ set -e # exit on first error
sudo apt-get update sudo apt-get update
sudo apt-get -y install python3-pip sudo apt-get -y install python3-pip
pip3 install discord.py python-dotenv requests pytz tzlocal pip3 install discord.py python-dotenv requests
fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env" fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env"
logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8" logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8"