Merge pull request #530 from NebulousLabs/sevey/parse-siac

Update Siafile Health Check to parse siac
Ivaylo Novakov authored on 2020-11-25 12:40:48 +01:00, committed by GitHub
commit 3919037277
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 123 additions and 41 deletions
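In short: check_alerts below stops querying the /daemon/alerts JSON endpoint with requests and instead counts severity markers in the plain-text output of `siac alerts` run inside the portal's sia container. A minimal sketch of that counting, assuming alert blocks shaped roughly like this (the exact siac output format is not part of this diff):

    # Illustrative only: an assumed fragment of `siac alerts` output.
    siac_alert_output = """\
    Severity: warning
    Message: The SiaFile mentioned in the 'Cause' is below 75% redundancy
    Cause: /var/files/backup.dat has a health of 1.25
    """

    num_warning_alerts = sum(
        "Severity: warning" in line for line in siac_alert_output.split("\n")
    )
    print(num_warning_alerts)  # 1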


@@ -1,7 +1,16 @@
 #!/usr/bin/env python3
-import asyncio, json, os, re, sys, traceback, discord, requests, time
+import asyncio
+import json
+import os
+import re
+import sys
+import time
+import traceback
 from datetime import datetime, timedelta
+
+import discord
+import requests
 from bot_utils import setup, send_msg
 
 """
@@ -9,6 +18,11 @@ health-checker reads the /health-check endpoint of the portal and dispatches
 messages to a Discord channel.
 """
 
+# Get the container name as an argument or use "sia" as default.
+CONTAINER_NAME = "sia"
+if len(sys.argv) > 2:
+    CONTAINER_NAME = sys.argv[2]
+
 # Get the number of hours to look back in the logs or use 1 as default.
 CHECK_HOURS = 1
 if len(sys.argv) > 3:
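For reference, how the defaults above resolve for a hypothetical invocation (the meaning of argv[1] is outside this diff, and the int coercion of argv[3] is assumed):

    # Hypothetical invocation: python3 health-checker.py <argv1> mysia 4
    # sys.argv       -> ["health-checker.py", "<argv1>", "mysia", "4"]
    # CONTAINER_NAME -> "mysia" (falls back to "sia" when argv[2] is absent)
    # CHECK_HOURS    -> 4 (falls back to 1 when argv[3] is absent)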
@@ -47,6 +61,7 @@ async def run_checks():
         await check_disk()
         await check_health()
         await check_alerts()
+        await check_portal_size()
     except:
         trace = traceback.format_exc()
         print("[DEBUG] run_checks() failed.")
@@ -68,7 +83,8 @@ async def check_load_average():
     pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
     load_av = re.match(pattern, uptime_string).group(1)
     if float(load_av) > 10:
-        message = "High system load detected in uptime output: {}".format(uptime_string)
+        message = "High system load detected in uptime output: {}".format(
+            uptime_string)
         await send_msg(client, message, force_notify=True)
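The regex above captures the third (15-minute) load-average figure; a quick sketch against a typical `uptime` line (illustrative output, not from this diff):

    import re

    uptime_string = "12:40:48 up 10 days, 3:02, 1 user, load average: 0.09, 0.27, 10.44"
    pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
    print(re.match(pattern, uptime_string).group(1))  # "10.44" -> triggers the alert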
@@ -78,6 +94,7 @@ async def check_disk():
     # We check free disk space in 1024 byte units, so it's easy to convert.
     df = os.popen("df --block-size=1024").read().strip()
     volumes = {}
+    # Iterate over the output, ignoring the header line
     for line in df.split("\n")[1:]:
         fields = list(filter(None, line.split(" ")))
         # -1 is "mounted on", 3 is "available space" in KiB which we want in bytes
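As a quick illustration of the parsing above, one assumed `df --block-size=1024` output line (the header row is skipped by the slice):

    line = "/dev/sda1  103081248  5332884  92478204  6% /home"
    fields = list(filter(None, line.split(" ")))
    print(fields[3], fields[-1])  # -> 92478204 /home (available KiB, mount point)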
@@ -99,13 +116,15 @@ async def check_disk():
         # and shut down sia container so it doesn't get corrupted
         if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD_CRITICAL:
             free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
-            message = "CRITICAL! Very low disk space: {}GiB, **siad stopped**!".format(free_space_gb)
+            message = "CRITICAL! Very low disk space: {}GiB, **siad stopped**!".format(
+                free_space_gb)
             inspect = os.popen("docker inspect sia").read().strip()
             inspect_json = json.loads(inspect)
             if inspect_json[0]["State"]["Running"] == True:
-                os.popen("docker exec health-check cli/disable") # mark portal as unhealthy
-                time.sleep(300) # wait 5 minutes to propagate dns changes
-                os.popen("docker stop sia") # stop sia container
+                # mark portal as unhealthy
+                os.popen("docker exec health-check cli/disable")
+                time.sleep(300)  # wait 5 minutes to propagate dns changes
+                os.popen("docker stop sia")  # stop sia container
             return await send_msg(client, message, force_notify=True)
 
     # if we've reached a free disk space threshold we need to send proper notice
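The Running check above depends on the JSON shape emitted by `docker inspect` (a one-element array per inspected container); an abridged sketch:

    import json

    # Abridged, assumed shape of `docker inspect sia` output
    inspect = '[{"State": {"Running": true}}]'
    inspect_json = json.loads(inspect)
    print(inspect_json[0]["State"]["Running"])  # True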
@@ -173,7 +192,7 @@ async def check_health():
             failed_records.append(verbose)
 
     ################################################################################
-    ################ create a message
+    # create a message
     ################################################################################
 
     message = ""
@@ -192,7 +211,8 @@ async def check_health():
         )
         force_notify = True
     else:
-        message += "All {} critical checks passed. ".format(critical_checks_total)
+        message += "All {} critical checks passed. ".format(
+            critical_checks_total)
 
     if verbose_checks_failed:
         message += "{}/{} verbose checks failed over the last {} hours! ".format(
@@ -200,7 +220,8 @@ async def check_health():
         )
         force_notify = True
     else:
-        message += "All {} verbose checks passed. ".format(verbose_checks_total)
+        message += "All {} verbose checks passed. ".format(
+            verbose_checks_total)
 
     if len(failed_records):
         failed_records_file = json.dumps(failed_records, indent=2)
@@ -211,59 +232,120 @@ async def check_health():
         client, message, file=failed_records_file, force_notify=force_notify
     )
 
+
+# contains_string is a simple helper to check if a string contains a string.
+# This is faster and easier than regex for word comparisons
+def contains_string(string_to_check, string_to_find):
+    return string_to_find in string_to_check
+
+
 # check_alerts checks the alerts returned from siad's daemon/alerts API
 async def check_alerts():
     print("\nChecking portal siad alerts...")
-    try:
-        alerts_res = requests.get("http://localhost:9980/daemon/alerts", headers={"User-Agent": "Sia-Agent"}, verify=False)
-        alerts_json = alerts_res.json()
-    except:
-        trace = traceback.format_exc()
-        print("[DEBUG] check_alerts() failed.")
-        return await send_msg(
-            client, "Failed to run the checks!", file=trace, force_notify=True
-        )
+    # Execute siac alerts and read the response
+    cmd_string = "docker exec {} siac alerts".format(CONTAINER_NAME)
+    siac_alert_output = os.popen(cmd_string).read().strip()
 
-    alerts = alerts_json['alerts']
-    critical_alerts = alerts_json['criticalalerts']
-    error_alerts = alerts_json['erroralerts']
-    warning_alerts = alerts_json['warningalerts']
+    # Initialize variables
+    num_critical_alerts = 0
+    num_error_alerts = 0
+    num_warning_alerts = 0
+    num_siafile_alerts = 0
     siafile_alerts = []
+
+    # Pattern strings to search for
+    critical = 'Severity: critical'
+    error = 'Severity: error'
+    warning = 'Severity: warning'
+    health_of = 'has a health of'
     siafile_alert_message = "The SiaFile mentioned in the 'Cause' is below 75% redundancy"
 
-    # Check for siafile alerts in alerts. This is so that the alert severity
-    # can change and this doesn't need to be updated
-    for alert in alerts:
-        if alert['msg'] == siafile_alert_message:
-            siafile_alerts.append(alert)
+    # Split the output by line and check for type of alert and siafile alerts
+    for line in siac_alert_output.split("\n"):
+        # Check for the type of alert
+        if contains_string(line, critical):
+            num_critical_alerts += 1
+        if contains_string(line, error):
+            num_error_alerts += 1
+        if contains_string(line, warning):
+            num_warning_alerts += 1
+
+        # Check for siafile alerts in alerts. This is so that the alert
+        # severity can change and this doesn't need to be updated
+        if contains_string(line, siafile_alert_message):
+            num_siafile_alerts += 1
+        if contains_string(line, health_of):
+            siafile_alerts.append(line)
 
     ################################################################################
-    ################ create a message
+    # create a message
     ################################################################################
 
     message = ""
     force_notify = False
-    if len(critical_alerts) > 0:
-        message += "{} CRITICAL Alerts found! ".format(len(critical_alerts))
+    if num_critical_alerts > 0:
+        message += "{} CRITICAL Alerts found! ".format(num_critical_alerts)
         force_notify = True
-    if len(error_alerts) > 0:
-        message += "{} Error Alerts found! ".format(len(error_alerts))
+    if num_error_alerts > 0:
+        message += "{} Error Alerts found! ".format(num_error_alerts)
         force_notify = True
 
-    message += "{} Warning Alerts found. ".format(len(warning_alerts))
-    message += "{} SiaFiles with bad health found. ".format(len(siafile_alerts))
+    # Subtract out the siafile alerts from the warning alerts since we announce
+    # them separately
+    num_warning_alerts -= num_siafile_alerts
+    message += "{} Warning Alerts found. ".format(num_warning_alerts)
+    message += "{} SiaFiles with bad health found. ".format(num_siafile_alerts)
 
-    alerts_file = None
-    if len(alerts) > 0:
-        alerts_file = json.dumps(alerts, indent=2)
+    # send a message if we force notification, or just once daily (heartbeat)
+    # at 1 AM
+    if force_notify or datetime.utcnow().hour == 1:
+        return await send_msg(
+            client, message, file=siac_alert_output, force_notify=force_notify
+        )
+
+
+# check_portal_size checks the number of files that the portal is managing to
+# determine if it is time to rotate it out
+async def check_portal_size():
+    print("\nChecking portal size...")
+    # Execute siac renter to check the size of the portal
+    #
+    # NOTE: we should leave this as always trying to execute the docker command
+    # against the sia container, as this will then fail for maintenance servers
+    # where we don't care about this check.
+    cmd_string = "docker exec sia siac renter"
+    siac_renter_output = os.popen(cmd_string).read().strip()
+
+    # Initialize variables
+    num_files = 0
+    max_files = 250000
+    files_text = "Files"
+    for line in siac_renter_output.split("\n"):
+        if contains_string(line, files_text):
+            for el in line.split():
+                if el.isdigit():
+                    num_files = int(el)
+
+    ################################################################################
+    # create a message
+    ################################################################################
+    message = ""
+    force_notify = False
+    if num_files > max_files:
+        message += "Portal has {} files! Consider rotating! ".format(num_files)
+        # send notification when 40% above the limit
+        force_notify = num_files > max_files * 1.4
+    else:
+        message += "Portal has {} files. ".format(num_files)
 
     # send a message if we force notification, or just once daily (heartbeat) at 1 AM
     if force_notify or datetime.utcnow().hour == 1:
         return await send_msg(
-            client, message, file=alerts_file, force_notify=force_notify
+            client, message, force_notify=force_notify
         )
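A sketch of the file-count extraction in check_portal_size, assuming the `siac renter` summary contains a line like the following (exact wording and spacing may differ):

    line = "  Files:               107425"
    num_files = 0
    for el in line.split():
        if el.isdigit():
            num_files = int(el)
    print(num_files)  # 107425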