reimplement health checks (#434)
This commit is contained in:
parent
5e9f88bfce
commit
10a251c081
|
@ -3,8 +3,9 @@
|
||||||
from urllib.request import urlopen, Request
|
from urllib.request import urlopen, Request
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
import urllib, json, os, traceback, discord, sys
|
import urllib, json, os, traceback, discord, sys, re, subprocess, requests, io
|
||||||
|
|
||||||
# sc_precision is the number of hastings per siacoin
|
# sc_precision is the number of hastings per siacoin
|
||||||
sc_precision = 10 ** 24
|
sc_precision = 10 ** 24
|
||||||
|
@ -17,6 +18,22 @@ api_endpoint, port, portal_name, bot_token, password = None, None, None, None, N
|
||||||
discord_client = None
|
discord_client = None
|
||||||
setup_done = False
|
setup_done = False
|
||||||
|
|
||||||
|
# The docker container name is read from the second command-line argument
# when one is supplied; otherwise it falls back to "sia".
CONTAINER_NAME = sys.argv[2] if len(sys.argv) > 2 else "sia"
|
||||||
|
|
||||||
|
# find out local siad ip by inspecting its docker container
|
||||||
|
def get_api_ip():
    """Return the IP address of the local siad docker container.

    Runs ``docker inspect`` on the container named by ``CONTAINER_NAME`` and
    returns the first IPv4-looking address found in its output.

    Returns:
        The address as a string, e.g. ``"172.18.0.2"``.

    Raises:
        subprocess.CalledProcessError: if ``docker inspect`` exits non-zero.
        IndexError: if the output contains no IPv4-looking address.
    """
    ip_regex = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
    # The trailing space after each address keeps them separable when the
    # container is attached to more than one network; without it the template
    # prints the addresses back to back and none of them can be matched.
    template = "{{range .NetworkSettings.Networks}}{{.IPAddress}} {{end}}"
    # Pass an argument list instead of a shell string: CONTAINER_NAME can come
    # from the command line and must never be interpreted by a shell.
    output = subprocess.check_output(
        ["docker", "inspect", "-f", template, CONTAINER_NAME]
    ).decode("utf-8")
    addresses = [token for token in output.split() if ip_regex.fullmatch(token)]
    if not addresses:
        # Same exception type the original indexing raised, but with context.
        raise IndexError(
            "no IP address found for docker container {!r}".format(CONTAINER_NAME)
        )
    return addresses[0]
|
||||||
|
|
||||||
|
|
||||||
def setup():
|
def setup():
|
||||||
# Load dotenv file if possible.
|
# Load dotenv file if possible.
|
||||||
# TODO: change all scripts to use named flags/params
|
# TODO: change all scripts to use named flags/params
|
||||||
|
@ -37,7 +54,7 @@ def setup():
|
||||||
port = "9980"
|
port = "9980"
|
||||||
|
|
||||||
global api_endpoint
|
global api_endpoint
|
||||||
api_endpoint = "http://localhost:{}".format(port)
|
api_endpoint = "http://{}:{}".format(get_api_ip(), port)
|
||||||
|
|
||||||
siad.initialize()
|
siad.initialize()
|
||||||
|
|
||||||
|
@ -46,6 +63,7 @@ def setup():
|
||||||
|
|
||||||
return bot_token
|
return bot_token
|
||||||
|
|
||||||
|
|
||||||
# send_msg sends the msg to the specified discord channel. If force_notify is set to true it adds "@here".
|
# send_msg sends the msg to the specified discord channel. If force_notify is set to true it adds "@here".
|
||||||
async def send_msg(client, msg, force_notify=False, file=None):
|
async def send_msg(client, msg, force_notify=False, file=None):
|
||||||
await client.wait_until_ready()
|
await client.wait_until_ready()
|
||||||
|
@ -69,14 +87,48 @@ async def send_msg(client, msg, force_notify=False, file=None):
|
||||||
break
|
break
|
||||||
|
|
||||||
# Add the portal name.
|
# Add the portal name.
|
||||||
msg = "`{}`: {}".format(portal_name, msg)
|
msg = "**{}**: {}".format(portal_name, msg)
|
||||||
|
|
||||||
|
if isinstance(file, str):
|
||||||
|
is_json = is_json_string(file)
|
||||||
|
content_type = "application/json" if is_json else "text/plain"
|
||||||
|
ext = "json" if is_json else "txt"
|
||||||
|
filename = "{}-{}.{}".format(
|
||||||
|
CONTAINER_NAME, datetime.utcnow().strftime("%Y-%m-%d-%H:%M:%S"), ext
|
||||||
|
)
|
||||||
|
skylink = upload_to_skynet(file, filename, content_type=content_type)
|
||||||
|
if skylink:
|
||||||
|
msg = "{} {}".format(msg, skylink) # append skylink to message
|
||||||
|
file = None # clean file reference, we're using a skylink
|
||||||
|
else:
|
||||||
|
file = discord.File(
|
||||||
|
io.BytesIO(file.encode()), filename=filename
|
||||||
|
) # wrap text into discord file wrapper
|
||||||
|
|
||||||
if force_notify:
|
if force_notify:
|
||||||
msg = "{}: \n{}".format(role.mention, msg)
|
msg = "{} /cc {}".format(msg, role.mention)
|
||||||
|
|
||||||
await chan.send(msg, file=file)
|
await chan.send(msg, file=file)
|
||||||
|
|
||||||
|
|
||||||
#siad class provides wrappers for the necessary siad commands.
|
def upload_to_skynet(
    contents, filename="file.txt", content_type="text/plain", timeout=60
):
    """Upload ``contents`` to the siasky.net portal and return the file URL.

    Args:
        contents: payload handed to ``requests`` as the uploaded file body.
        filename: file name reported to the portal.
        content_type: MIME type reported to the portal.
        timeout: seconds to wait for the portal before giving up.

    Returns:
        ``"https://siasky.net/" + skylink`` on success, or ``None`` when the
        request fails, the portal answers with a non-OK status, or the
        response does not contain a usable ``skylink`` field.
    """
    files = {"file": (filename, contents, content_type)}
    try:
        res = requests.post(
            "https://siasky.net/skynet/skyfile", files=files, timeout=timeout
        )
        if res.status_code != requests.codes.ok:
            return None
        return "https://siasky.net/" + res.json()["skylink"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # None means "upload failed": send_msg then falls back to attaching
        # the text as a discord file, so a network hiccup or a malformed
        # response must not abort the notification.
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def is_json_string(str):
    """Return True when ``str`` parses as a JSON document, False otherwise.

    Args:
        str: the candidate text. The name shadows the builtin but is kept so
            existing keyword callers keep working.

    Returns:
        True if ``json.loads`` accepts the value; False if it is not valid
        JSON or is not a type ``json.loads`` can parse (e.g. ``None``).
    """
    try:
        json.loads(str)
    except (ValueError, TypeError):
        # ValueError: malformed JSON text. TypeError: input is not
        # str/bytes/bytearray, which by definition is not a JSON string.
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
# siad class provides wrappers for the necessary siad commands.
|
||||||
class siad:
|
class siad:
|
||||||
# initializes values for using the API (password and
|
# initializes values for using the API (password and
|
||||||
# user-agent) so that all calls to urllib.request.urlopen have these set.
|
# user-agent) so that all calls to urllib.request.urlopen have these set.
|
||||||
|
@ -90,7 +142,7 @@ class siad:
|
||||||
|
|
||||||
# Setup an opener with the correct user agent
|
# Setup an opener with the correct user agent
|
||||||
opener = urllib.request.build_opener(handler)
|
opener = urllib.request.build_opener(handler)
|
||||||
opener.addheaders = [('User-agent', 'Sia-Agent')]
|
opener.addheaders = [("User-agent", "Sia-Agent")]
|
||||||
|
|
||||||
# Install the opener.
|
# Install the opener.
|
||||||
# Now all calls to urllib.request.urlopen use our opener.
|
# Now all calls to urllib.request.urlopen use our opener.
|
||||||
|
@ -102,7 +154,7 @@ class siad:
|
||||||
password = os.getenv("SIA_API_PASSWORD")
|
password = os.getenv("SIA_API_PASSWORD")
|
||||||
if not password:
|
if not password:
|
||||||
home = os.getenv("HOME")
|
home = os.getenv("HOME")
|
||||||
password_file = open(home+"/.sia/apipassword")
|
password_file = open(home + "/.sia/apipassword")
|
||||||
password = password_file.readlines()[0].strip()
|
password = password_file.readlines()[0].strip()
|
||||||
return password
|
return password
|
||||||
|
|
||||||
|
@ -111,26 +163,26 @@ class siad:
|
||||||
def load_json(resp):
    """Decode a UTF-8 encoded JSON payload and return the parsed value."""
    text = resp.decode("utf-8")
    parsed = json.loads(text)
    return parsed
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
def get_wallet():
    """Fetch ``/wallet`` from the siad API and return the parsed JSON.

    Calls ``setup()`` first when ``setup_done`` is still false, so that
    ``api_endpoint`` is configured before the request is made.
    """
    if not setup_done:
        setup()

    # Use the response as a context manager so the HTTP connection is closed
    # as soon as the body has been read instead of being left to the GC.
    with urllib.request.urlopen(api_endpoint + "/wallet") as response:
        resp = response.read()
    return siad.load_json(resp)
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
def get_renter():
    """Fetch ``/renter`` from the siad API and return the parsed JSON.

    Calls ``setup()`` first when ``setup_done`` is still false, so that
    ``api_endpoint`` is configured before the request is made.
    """
    if not setup_done:
        setup()

    # Use the response as a context manager so the HTTP connection is closed
    # as soon as the body has been read instead of being left to the GC.
    with urllib.request.urlopen(api_endpoint + "/renter") as response:
        resp = response.read()
    return siad.load_json(resp)
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
def get_renter_contracts():
    """Fetch ``/renter/contracts`` from the siad API and return the parsed JSON.

    Calls ``setup()`` first when ``setup_done`` is still false, so that
    ``api_endpoint`` is configured before the request is made.
    """
    if not setup_done:
        setup()

    # Use the response as a context manager so the HTTP connection is closed
    # as soon as the body has been read instead of being left to the GC.
    with urllib.request.urlopen(api_endpoint + "/renter/contracts") as response:
        resp = response.read()
    return siad.load_json(resp)
|
||||||
|
|
|
@ -27,8 +27,7 @@ async def run_checks():
|
||||||
print("Running Skynet portal funds checks")
|
print("Running Skynet portal funds checks")
|
||||||
try:
|
try:
|
||||||
await check_funds()
|
await check_funds()
|
||||||
|
except: # catch all exceptions
|
||||||
except: # catch all exceptions
|
|
||||||
trace = traceback.format_exc()
|
trace = traceback.format_exc()
|
||||||
await send_msg(client, "```\n{}\n```".format(trace), force_notify=True)
|
await send_msg(client, "```\n{}\n```".format(trace), force_notify=True)
|
||||||
|
|
||||||
|
@ -41,38 +40,43 @@ async def check_funds():
|
||||||
wallet_get = siad.get_wallet()
|
wallet_get = siad.get_wallet()
|
||||||
renter_get = siad.get_renter()
|
renter_get = siad.get_renter()
|
||||||
|
|
||||||
if not wallet_get['unlocked']:
|
if not wallet_get["unlocked"]:
|
||||||
await send_msg(client, "Wallet locked", force_notify=True)
|
await send_msg(client, "Wallet locked", force_notify=True)
|
||||||
return
|
return
|
||||||
|
|
||||||
confirmed_coins = int(wallet_get['confirmedsiacoinbalance'])
|
confirmed_coins = int(wallet_get["confirmedsiacoinbalance"])
|
||||||
unconfirmed_coins = int(wallet_get['unconfirmedincomingsiacoins'])
|
unconfirmed_coins = int(wallet_get["unconfirmedincomingsiacoins"])
|
||||||
unconfirmed_outgoing_coins = int(wallet_get['unconfirmedoutgoingsiacoins'])
|
unconfirmed_outgoing_coins = int(wallet_get["unconfirmedoutgoingsiacoins"])
|
||||||
balance = confirmed_coins + unconfirmed_coins - unconfirmed_outgoing_coins
|
balance = confirmed_coins + unconfirmed_coins - unconfirmed_outgoing_coins
|
||||||
print("Balance: ", balance / sc_precision)
|
print("Balance: ", balance / sc_precision)
|
||||||
|
|
||||||
allowance = renter_get['settings']['allowance']
|
allowance = renter_get["settings"]["allowance"]
|
||||||
allowance_funds = int(allowance['funds'])
|
allowance_funds = int(allowance["funds"])
|
||||||
allocated_funds = int(renter_get['financialmetrics']['totalallocated'])
|
allocated_funds = int(renter_get["financialmetrics"]["totalallocated"])
|
||||||
unallocated_funds = allowance_funds - allocated_funds
|
unallocated_funds = allowance_funds - allocated_funds
|
||||||
|
|
||||||
|
balance_msg = "Balance: {} SC, Allowance Funds: {} SC".format(
|
||||||
balance_msg = "Balance: `{} SC` Allowance Funds: `{} SC`".format(round(balance/sc_precision), round(allowance_funds/sc_precision))
|
round(balance / sc_precision), round(allowance_funds / sc_precision)
|
||||||
alloc_msg = "Unallocated: `{} SC`\nAllocated: `{} SC`".format(round(unallocated_funds/sc_precision), round(allocated_funds/sc_precision))
|
)
|
||||||
|
alloc_msg = "Unallocated: {} SC, Allocated: {} SC".format(
|
||||||
|
round(unallocated_funds / sc_precision), round(allocated_funds / sc_precision)
|
||||||
|
)
|
||||||
|
|
||||||
# Send an alert if there is less than 1 allowance worth of money left.
|
# Send an alert if there is less than 1 allowance worth of money left.
|
||||||
if balance < allowance_funds:
|
if balance < allowance_funds:
|
||||||
await send_msg(client, "Wallet balance running low. \n{}".format(balance_msg), force_notify=True)
|
message = "__Wallet balance running low!__ {}".format(balance_msg)
|
||||||
return
|
return await send_msg(client, message, force_notify=True)
|
||||||
|
|
||||||
# Alert devs when only a fraction of the allowance is remaining.
|
# Alert devs when only a fraction of the allowance is remaining.
|
||||||
SPEND_THRESHOLD = 0.8
|
SPEND_THRESHOLD = 0.8
|
||||||
if allocated_funds >= SPEND_THRESHOLD * allowance_funds :
|
if allocated_funds >= SPEND_THRESHOLD * allowance_funds:
|
||||||
await send_msg(client, "More than {:.0%} of allowance spent: \n{}".format(SPEND_THRESHOLD, alloc_msg), force_notify=True)
|
message = "__More than {:.0%} of allowance spent!__ {}".format(
|
||||||
return
|
SPEND_THRESHOLD, alloc_msg
|
||||||
|
)
|
||||||
|
return await send_msg(client, message, force_notify=True)
|
||||||
|
|
||||||
# Send an informational heartbeat if all checks passed.
|
# Send an informational heartbeat if all checks passed.
|
||||||
await send_msg(client, "Funds checks passed:\n{} \n{}".format(balance_msg, alloc_msg))
|
await send_msg(client, "Funds checks passed. {} {}".format(balance_msg, alloc_msg))
|
||||||
|
|
||||||
|
|
||||||
client.run(bot_token)
|
client.run(bot_token)
|
||||||
|
|
|
@ -1,19 +1,8 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import asyncio
|
import asyncio, json, os, re, sys, traceback, discord, requests
|
||||||
import io
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import traceback
|
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
import discord
|
|
||||||
import pytz.reference
|
|
||||||
import requests
|
|
||||||
from bot_utils import setup, send_msg
|
from bot_utils import setup, send_msg
|
||||||
from tzlocal import get_localzone
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
health-checker reads the /health-check endpoint of the portal and dispatches
|
health-checker reads the /health-check endpoint of the portal and dispatches
|
||||||
|
@ -55,19 +44,19 @@ async def run_checks():
|
||||||
try:
|
try:
|
||||||
await check_load_average()
|
await check_load_average()
|
||||||
await check_disk()
|
await check_disk()
|
||||||
# await check_health() # FIXME: adjust it to work with https://github.com/NebulousLabs/skynet-webportal/pull/389
|
await check_health()
|
||||||
except:
|
except:
|
||||||
trace = traceback.format_exc()
|
trace = traceback.format_exc()
|
||||||
print("[DEBUG] run_checks() failed.")
|
print("[DEBUG] run_checks() failed.")
|
||||||
if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
|
await send_msg(
|
||||||
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
|
client,
|
||||||
else:
|
"Failed to run the portal health checks!",
|
||||||
await send_msg(client, "Failed to run the portal health checks!",
|
file=trace,
|
||||||
file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
|
force_notify=True,
|
||||||
force_notify=True)
|
)
|
||||||
|
|
||||||
|
|
||||||
# check_load_average monitors the system's load average value and issues a
|
# check_load_average monitors the system load average value and issues a
|
||||||
# warning message if it exceeds 10.
|
# warning message if it exceeds 10.
|
||||||
async def check_load_average():
|
async def check_load_average():
|
||||||
uptime_string = os.popen("uptime").read().strip()
|
uptime_string = os.popen("uptime").read().strip()
|
||||||
|
@ -77,7 +66,8 @@ async def check_load_average():
|
||||||
pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
|
pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
|
||||||
load_av = re.match(pattern, uptime_string).group(1)
|
load_av = re.match(pattern, uptime_string).group(1)
|
||||||
if float(load_av) > 10:
|
if float(load_av) > 10:
|
||||||
await send_msg(client, "High system load detected: `uptime: {}`".format(uptime_string), force_notify=True)
|
message = "High system load detected in uptime output: {}".format(uptime_string)
|
||||||
|
await send_msg(client, message, force_notify=True)
|
||||||
|
|
||||||
|
|
||||||
# check_disk checks the amount of free space on the /home partition and issues
|
# check_disk checks the amount of free space on the /home partition and issues
|
||||||
|
@ -100,13 +90,12 @@ async def check_disk():
|
||||||
vol = mp
|
vol = mp
|
||||||
break
|
break
|
||||||
if vol == "":
|
if vol == "":
|
||||||
msg = "Failed to check free disk space! Didn't find a suitable mount point to check.\ndf output:\n{}".format(df)
|
message = "Failed to check free disk space! Didn't find a suitable mount point to check."
|
||||||
await send_msg(client, msg)
|
return await send_msg(client, message, file=df)
|
||||||
return
|
|
||||||
if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD:
|
if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD:
|
||||||
free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
|
free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
|
||||||
await send_msg(client, "WARNING! Low disk space: {}GiB".format(free_space_gb), force_notify=True)
|
message = "WARNING! Low disk space: {}GiB".format(free_space_gb)
|
||||||
return
|
return await send_msg(client, message, force_notify=True)
|
||||||
|
|
||||||
|
|
||||||
# check_health checks /health-check endpoint and reports recent issues
|
# check_health checks /health-check endpoint and reports recent issues
|
||||||
|
@ -114,55 +103,94 @@ async def check_health():
|
||||||
print("\nChecking portal health status...")
|
print("\nChecking portal health status...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
res = requests.get("http://localhost/health-check", verify=False)
|
res_check = requests.get("http://localhost/health-check", verify=False)
|
||||||
|
json_check = res_check.json()
|
||||||
|
json_critical = requests.get(
|
||||||
|
"http://localhost/health-check/critical", verify=False
|
||||||
|
).json()
|
||||||
|
json_verbose = requests.get(
|
||||||
|
"http://localhost/health-check/verbose", verify=False
|
||||||
|
).json()
|
||||||
except:
|
except:
|
||||||
trace = traceback.format_exc()
|
trace = traceback.format_exc()
|
||||||
print("[DEBUG] check_health() failed.")
|
print("[DEBUG] check_health() failed.")
|
||||||
if len(trace) < DISCORD_MAX_MESSAGE_LENGTH:
|
return await send_msg(
|
||||||
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
|
client, "Failed to run the checks!", file=trace, force_notify=True
|
||||||
else:
|
)
|
||||||
await send_msg(client, "Failed to run the checks!",
|
|
||||||
file=discord.File(io.BytesIO(trace.encode()), filename="failed_checks.log"),
|
critical_checks_total = 0
|
||||||
force_notify=True)
|
critical_checks_failed = 0
|
||||||
return
|
|
||||||
|
verbose_checks_total = 0
|
||||||
|
verbose_checks_failed = 0
|
||||||
|
|
||||||
# Check the health records.
|
|
||||||
passed_checks = 0
|
|
||||||
failed_checks = 0
|
|
||||||
failed_critical = 0
|
|
||||||
failed_records = []
|
failed_records = []
|
||||||
time_limit_unaware = datetime.now() - timedelta(hours=CHECK_HOURS) # local time
|
failed_records_file = None
|
||||||
time_limit = time_limit_unaware.astimezone(get_localzone()) # time with time zone
|
|
||||||
for rec in res.json():
|
time_limit = datetime.utcnow() - timedelta(hours=CHECK_HOURS)
|
||||||
time_unaware = datetime.strptime(rec['date'], '%Y-%m-%dT%H:%M:%S.%fZ') # time in UTC
|
|
||||||
time = pytz.utc.localize(time_unaware) # time with time zone
|
for critical in json_critical:
|
||||||
|
time = datetime.strptime(critical["date"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
||||||
if time < time_limit:
|
if time < time_limit:
|
||||||
continue
|
continue
|
||||||
bad = False
|
bad = False
|
||||||
for check in rec['checks']:
|
for check in critical["checks"]:
|
||||||
if check['up'] == False:
|
critical_checks_total += 1
|
||||||
|
if check["up"] == False:
|
||||||
|
critical_checks_failed += 1
|
||||||
bad = True
|
bad = True
|
||||||
failed_checks += 1
|
|
||||||
if check['critical']:
|
|
||||||
failed_critical += 1
|
|
||||||
if bad:
|
if bad:
|
||||||
# We append the entire record, so we can get the full context.
|
failed_records.append(critical)
|
||||||
failed_records.append(rec)
|
|
||||||
passed_checks += 1
|
|
||||||
|
|
||||||
checks = passed_checks + failed_checks
|
for verbose in json_verbose:
|
||||||
if len(failed_records) > 0:
|
time = datetime.strptime(verbose["date"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
||||||
message = "Found {}/{} failed checks ({} critical) over the last {} hours!".format(failed_checks, checks,
|
if time < time_limit:
|
||||||
failed_critical, CHECK_HOURS)
|
continue
|
||||||
file = discord.File(io.BytesIO(json.dumps(failed_records, indent=2).encode()), filename="failed_checks.log")
|
bad = False
|
||||||
notifyTeam = failed_critical > 0
|
for check in verbose["checks"]:
|
||||||
await send_msg(client, message, file=file, force_notify=notifyTeam)
|
verbose_checks_total += 1
|
||||||
return
|
if check["up"] == False:
|
||||||
|
verbose_checks_failed += 1
|
||||||
|
bad = True
|
||||||
|
if bad:
|
||||||
|
failed_records.append(verbose)
|
||||||
|
|
||||||
# Send an informational heartbeat if all checks passed but only if it's in
|
################################################################################
|
||||||
# the first CHECK_HOURS hours of the day, essentially the first call.
|
################ create a message
|
||||||
if datetime.now().hour < CHECK_HOURS:
|
################################################################################
|
||||||
await send_msg(client, "Health checks passed: {}/{}\n".format(passed_checks, checks))
|
|
||||||
|
message = ""
|
||||||
|
force_notify = False
|
||||||
|
|
||||||
|
if json_check["disabled"]:
|
||||||
|
message += "__Portal manually disabled!__ "
|
||||||
|
force_notify = True
|
||||||
|
elif res_check.status_code is not requests.codes["ok"]:
|
||||||
|
message += "__Portal down!!!__ "
|
||||||
|
force_notify = True
|
||||||
|
|
||||||
|
if critical_checks_failed:
|
||||||
|
message += "{}/{} CRITICAL checks failed over the last {} hours! ".format(
|
||||||
|
critical_checks_failed, critical_checks_total, CHECK_HOURS
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
message += "All {} critical checks passed. ".format(critical_checks_total)
|
||||||
|
|
||||||
|
if verbose_checks_failed:
|
||||||
|
message += "{}/{} verbose checks failed over the last {} hours! ".format(
|
||||||
|
verbose_checks_failed, verbose_checks_total, CHECK_HOURS
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
message += "All {} verbose checks passed. ".format(verbose_checks_total)
|
||||||
|
|
||||||
|
if len(failed_records):
|
||||||
|
failed_records_file = json.dumps(failed_records, indent=2)
|
||||||
|
|
||||||
|
# send a message if we force notification, there is a failures dump or just once daily (heartbeat) on 1 AM
|
||||||
|
if force_notify or failed_records_file or datetime.utcnow().hour == 1:
|
||||||
|
return await send_msg(
|
||||||
|
client, message, file=failed_records_file, force_notify=force_notify
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
client.run(bot_token)
|
client.run(bot_token)
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import discord, sys, traceback, io, os, asyncio
|
import discord, sys, traceback, io, os, asyncio
|
||||||
from bot_utils import setup, send_msg
|
from bot_utils import setup, send_msg, upload_to_skynet
|
||||||
from datetime import datetime, timedelta
|
|
||||||
from subprocess import Popen, PIPE
|
from subprocess import Popen, PIPE
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
@ -52,7 +51,7 @@ async def run_checks():
|
||||||
print("Running Skynet portal log checks")
|
print("Running Skynet portal log checks")
|
||||||
try:
|
try:
|
||||||
await check_docker_logs()
|
await check_docker_logs()
|
||||||
except: # catch all exceptions
|
except: # catch all exceptions
|
||||||
trace = traceback.format_exc()
|
trace = traceback.format_exc()
|
||||||
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
|
await send_msg(client, "```\n{}\n```".format(trace), force_notify=False)
|
||||||
|
|
||||||
|
@ -61,43 +60,47 @@ async def run_checks():
|
||||||
async def check_docker_logs():
    """Inspect the container's recent docker logs and report via Discord.

    Runs ``docker logs --since <CHECK_HOURS>h <CONTAINER_NAME>`` and then sends
    exactly one message through ``send_msg``:

    * anything on the command's stderr -> "Error(s) found in log!" with the
      (trimmed) stderr text passed as ``file``, notifying the team;
    * "Critical", "Severe" or "panic" in stdout -> the whole stdout passed as
      ``file``, notifying the team;
    * otherwise an informational heartbeat message.

    Returns:
        Whatever ``send_msg`` returns for the message that was sent.
    """
    print("\nChecking docker logs...")

    since_string = "{}h".format(CHECK_HOURS)

    # Read the logs.
    print(
        "[DEBUG] Will run `docker logs --since {} {}`".format(
            since_string, CONTAINER_NAME
        )
    )
    proc = Popen(
        ["docker", "logs", "--since", since_string, CONTAINER_NAME],
        stdin=PIPE,
        stdout=PIPE,
        stderr=PIPE,
        text=True,
    )
    std_out, std_err = proc.communicate()

    if len(std_err) > 0:
        # Trim the error log to at most 1MB (counted in characters), cutting
        # at a line boundary when possible.
        one_mb = 1024 * 1024
        if len(std_err) > one_mb:
            # Slice the tail first: if it contains no newline, find() returns
            # -1 and `tail[0:]` still yields the bounded tail rather than the
            # whole untrimmed log.
            tail = std_err[-one_mb:]
            pos = tail.find("\n")
            std_err = tail[pos + 1 :]
        return await send_msg(
            client, "Error(s) found in log!", file=std_err, force_notify=True
        )

    # If there are any critical or severe errors, upload the whole log file.
    if "Critical" in std_out or "Severe" in std_out or "panic" in std_out:
        return await send_msg(
            client,
            "Critical or Severe error found in log!",
            file=std_out,
            force_notify=True,
        )

    # No critical or severe errors, return a heartbeat type message
    return await send_msg(
        client,
        "No critical or severe warnings in log since `{}` hours".format(CHECK_HOURS),
    )
|
||||||
|
|
||||||
|
|
||||||
client.run(bot_token)
|
client.run(bot_token)
|
||||||
|
|
|
@ -5,7 +5,7 @@ set -e # exit on first error
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get -y install python3-pip
|
sudo apt-get -y install python3-pip
|
||||||
|
|
||||||
pip3 install discord.py python-dotenv requests pytz tzlocal
|
pip3 install discord.py python-dotenv requests
|
||||||
|
|
||||||
fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env"
|
fundsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/funds-checker.py /home/user/skynet-webportal/.env"
|
||||||
logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8"
|
logsCheck="0 0,8,16 * * * /home/user/skynet-webportal/setup-scripts/log-checker.py /home/user/skynet-webportal/.env sia 8"
|
||||||
|
|
Reference in New Issue