#!/usr/bin/env python3

import asyncio
import json
import os
import re
import sys
import time
import traceback
from datetime import datetime, timedelta

import requests

from bot_utils import setup, send_msg, get_docker_container_id, get_docker_container_ip

"""
health-checker reads the /health-check endpoint of the portal and dispatches
messages to a Discord channel.
"""

# Get the container name as an argument or use "sia" as default.
CONTAINER_NAME = "sia"
if len(sys.argv) > 2:
    CONTAINER_NAME = sys.argv[2]

# Get the number of hours to look back in the logs or use 1 as default.
CHECK_HOURS = 1
if len(sys.argv) > 3:
    CHECK_HOURS = int(sys.argv[3])

# Discord messages have a length limit of 2000 characters. We use a lower
# limit in order to leave some space for additional message text.
DISCORD_MAX_MESSAGE_LENGTH = 1900

GB = 1 << 30  # 1 GiB in bytes

# Free disk space thresholds used for notices and for shutting down siad.
FREE_DISK_SPACE_THRESHOLD = 100 * GB
FREE_DISK_SPACE_THRESHOLD_CRITICAL = 60 * GB

# Disk usage dump log file (relative to this .py script).
DISK_USAGE_DUMP_LOG = "../../devops/disk-monitor/disk-usage-dump.log"

setup()


# run_checks runs the health checks.
async def run_checks():
    print("Running Skynet portal health checks")
    try:
        await check_load_average()
        await check_disk()
        await check_health()
        await check_alerts()
        await check_portal_size()
    except:
        trace = traceback.format_exc()
        print("[DEBUG] run_checks() failed.")
        await send_msg(
            "Failed to run the portal health checks!",
            file=trace,
            force_notify=True,
        )


# check_load_average monitors the system load average value and issues a
# warning message if it exceeds 10.
async def check_load_average():
    uptime_string = os.popen("uptime").read().strip()
    # sys.platform is "darwin" on macOS ("Darwin" is what platform.system()
    # returns).
    if sys.platform == "darwin":
        pattern = r"^.*load averages: \d*\.\d* \d*\.\d* (\d*\.\d*)$"
    else:
        pattern = r"^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
    load_av = re.match(pattern, uptime_string).group(1)
    if float(load_av) > 10:
        message = "High system load detected in uptime output: {}".format(uptime_string)
        # Disabling pings until we have a metrics solution and a process to
        # better address these.
        await send_msg(message, force_notify=False)
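# For reference only: check_disk() below parses `df --block-size=1024` output.
# On GNU coreutils the columns look roughly like this (values are made up,
# purely illustrative):
#
#   Filesystem     1K-blocks      Used Available Use% Mounted on
#   /dev/sda1      981723644 123456789 858266855  13% /home
#
# so fields[3] is the available space in KiB and fields[-1] is the mount point.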
# check_disk checks the amount of free space on the /home partition and issues
# a warning message if it is under FREE_DISK_SPACE_THRESHOLD bytes.
async def check_disk():
    # We check free disk space in 1024-byte units, so it's easy to convert.
    df = os.popen("df --block-size=1024").read().strip()
    volumes = {}
    # Iterate over the output, ignoring the header line.
    for line in df.split("\n")[1:]:
        fields = list(filter(None, line.split(" ")))
        # -1 is "mounted on", 3 is "available space" in KiB, which we want in bytes.
        volumes[fields[-1]] = int(fields[3]) * 1024

    # List of mount points, longest to shortest. We'll use that to find the
    # best fit for the volume we want to check.
    mount_points = sorted(volumes.keys(), key=len, reverse=True)
    wd = os.popen("pwd").read().strip()
    vol = ""
    for mp in mount_points:
        if wd.startswith(mp):
            vol = mp
            break
    if vol == "":
        message = "Failed to check free disk space! Didn't find a suitable mount point to check."
        return await send_msg(message, file=df)

    # If we've reached the critical free disk space threshold, we need to send
    # a proper notice and shut down the sia container so it doesn't get corrupted.
    if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD_CRITICAL:
        free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
        message = "CRITICAL! Very low disk space: {}GiB, **siad stopped**!".format(
            free_space_gb
        )
        # Dump disk usage.
        script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
        os.popen(
            script_dir + "/disk-usage-dump.sh " + script_dir + "/" + DISK_USAGE_DUMP_LOG
        )
        inspect = os.popen("docker inspect sia").read().strip()
        inspect_json = json.loads(inspect)
        if inspect_json[0]["State"]["Running"] is True:
            # Mark the portal as unhealthy.
            os.popen("docker exec health-check cli disable 'critical free disk space'")
            time.sleep(300)  # wait 5 minutes for DNS changes to propagate
            os.popen("docker stop sia")  # stop the sia container
        return await send_msg(message, force_notify=True)

    # If we've reached the free disk space threshold, we need to send a proper notice.
    if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD:
        free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
        message = "WARNING! Low disk space: {}GiB".format(free_space_gb)
        return await send_msg(message, force_notify=True)


# check_health checks the /health-check endpoint and reports recent issues.
async def check_health():
    print("\nChecking portal health status...")

    # Do not try to run health checks if the health-check container does not
    # exist. A possible use case is a fresh or taken-down server that has only
    # skyd running.
    if not get_docker_container_id("health-check"):
        print("Container health-check not found - skipping health checks")
        return

    try:
        endpoint = "http://{}:{}".format(get_docker_container_ip("health-check"), 3100)
    except:
        message = "Could not get health check service endpoint api!"
        return await send_msg(message, force_notify=True)

    try:
        res = requests.get(endpoint + "/health-check", verify=False)
        json_check = res.json()
        server_failure = (
            res.status_code != requests.codes["ok"] and json_check["disabled"] is False
        )

        res = requests.get(endpoint + "/health-check/critical", verify=False)
        json_critical = res.json()

        res = requests.get(endpoint + "/health-check/extended", verify=False)
        json_extended = res.json()
    except:
        message = traceback.format_exc()
        message += "\n" + "Request url: " + (res.url if res.url else "-")
        message += "\n" + "Status code: " + (str(res.status_code) if res.status_code else "-")
        message += "\n" + "Response body: " + (res.text if res.text else "-")
        return await send_msg(
            "Failed to run health checks!", file=message, force_notify=True
        )

    critical_checks_total = 0
    critical_checks_failed = 0
    extended_checks_total = 0
    extended_checks_failed = 0

    failed_records = []
    failed_records_file = None

    time_limit = datetime.utcnow().replace(
        minute=0, second=0, microsecond=0
    ) - timedelta(hours=CHECK_HOURS)

    for critical in json_critical:
        # Named record_time so it doesn't shadow the imported `time` module.
        record_time = datetime.strptime(critical["date"], "%Y-%m-%dT%H:%M:%S.%fZ")
        if record_time < time_limit:
            continue
        bad = False
        for check in critical["checks"]:
            critical_checks_total += 1
            if check["up"] is False:
                critical_checks_failed += 1
                bad = True
        if bad:
            critical["checks"] = [
                check for check in critical["checks"] if check["up"] is False
            ]
            failed_records.append(critical)

    for extended in json_extended:
        record_time = datetime.strptime(extended["date"], "%Y-%m-%dT%H:%M:%S.%fZ")
        if record_time < time_limit:
            continue
        bad = False
        for check in extended["checks"]:
            extended_checks_total += 1
            if check["up"] is False:
                extended_checks_failed += 1
                bad = True
        if bad:
            extended["checks"] = [
                check for check in extended["checks"] if check["up"] is False
            ]
            failed_records.append(extended)

    ################################################################################
    # create a message
    ################################################################################
    message = ""
    force_notify = False

    if server_failure:
        message += "__Server down!!!__ "
        force_notify = True

    if critical_checks_failed:
        message += "{}/{} CRITICAL checks failed over the last {} hours! ".format(
            critical_checks_failed, critical_checks_total, CHECK_HOURS
        )
        # Disabling as it creates notification fatigue.
        # force_notify = True
    else:
        message += "All {} critical checks passed. ".format(critical_checks_total)

    if extended_checks_failed:
        message += "{}/{} extended checks failed over the last {} hours! ".format(
            extended_checks_failed, extended_checks_total, CHECK_HOURS
        )
        # Disabling as it creates notification fatigue.
        # force_notify = True
    else:
        message += "All {} extended checks passed. ".format(extended_checks_total)

    if len(failed_records):
        failed_records_file = json.dumps(failed_records, indent=2)

    # Send a message if we force notification, if there is a failures dump, or
    # just once daily (heartbeat) at 1 AM UTC.
    if force_notify or failed_records_file or datetime.utcnow().hour == 1:
        return await send_msg(
            message, file=failed_records_file, force_notify=force_notify
        )
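# For reference only: the response shapes that check_health() above relies on,
# inferred from the fields it accesses (this is not an authoritative schema of
# the health-check service):
#
#   GET /health-check            -> {"disabled": false, ...}
#   GET /health-check/critical   -> [{"date": "2021-01-01T01:00:00.000000Z",
#                                     "checks": [{"up": true, ...}, ...]}, ...]
#   GET /health-check/extended   -> same shape as /health-check/critical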
# contains_string is a simple helper to check whether a string contains a
# substring. This is faster and easier than regex for word comparisons.
def contains_string(string_to_check, string_to_find):
    return string_to_find in string_to_check


# check_alerts checks the alerts returned from siad's daemon/alerts API.
async def check_alerts():
    print("\nChecking portal siad alerts...")

    ################################################################################
    # parse siac
    ################################################################################

    # Alerts
    # Execute 'siac alerts' and read the response.
    cmd_string = "docker exec {} siac alerts".format(CONTAINER_NAME)
    siac_alert_output = os.popen(cmd_string).read().strip()

    # Initialize variables.
    num_critical_alerts = 0
    num_error_alerts = 0
    num_warning_alerts = 0
    num_siafile_alerts = 0
    siafile_alerts = []

    # Pattern strings to search for.
    critical = "Severity: critical"
    error = "Severity: error"
    warning = "Severity: warning"
    health_of = "has a health of"
    siafile_alert_message = (
        "The SiaFile mentioned in the 'Cause' is below 75% redundancy"
    )

    # Split the output by line and check for the type of alert and for siafile
    # alerts.
    for line in siac_alert_output.split("\n"):
        # Check for the type of alert.
        if contains_string(line, critical):
            num_critical_alerts += 1
        if contains_string(line, error):
            num_error_alerts += 1
        if contains_string(line, warning):
            num_warning_alerts += 1

        # Check for siafile alerts in alerts. This is so that the alert
        # severity can change and this doesn't need to be updated.
        if contains_string(line, siafile_alert_message):
            num_siafile_alerts += 1
        if contains_string(line, health_of):
            siafile_alerts.append(line)

    # Repair Size
    # Execute 'siac renter' and read the response.
    cmd_string = "docker exec {} siac renter".format(CONTAINER_NAME)
    siac_renter_output = os.popen(cmd_string).read().strip()

    # Initialize variables.
    repair_remaining = ""

    # Pattern string to search for.
    repair_str = "Repair Data Remaining"

    # Split the output by line and check for the repair remaining.
    for line in siac_renter_output.split("\n"):
        if contains_string(line, repair_str):
            repair_remaining = line.split(":")[1].strip()

    ################################################################################
    # create a message
    ################################################################################
    message = ""
    force_notify = False
    if num_critical_alerts > 0:
        message += "{} CRITICAL Alerts found! ".format(num_critical_alerts)
        force_notify = True
    if num_error_alerts > 0:
        message += "{} Error Alerts found! ".format(num_error_alerts)

    # Subtract the siafile alerts from the warning alerts since we announce
    # them separately.
    num_warning_alerts -= num_siafile_alerts
    message += "{} Warning Alerts found. ".format(num_warning_alerts)
    message += "{} SiaFiles with bad health found. ".format(num_siafile_alerts)

    # Add repair size.
    message += "{} of repair remaining. ".format(repair_remaining)

    # Send a message if we force notification, or just once daily (heartbeat)
    # at 1 AM UTC.
    if force_notify or datetime.utcnow().hour == 1:
        return await send_msg(
            message, file=siac_alert_output, force_notify=force_notify
        )
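# For reference only: check_portal_size() below scans `siac renter` output for
# a line like the following (illustrative; exact spacing and wording depend on
# the siac version):
#
#   Files:               1234567
#
# and takes the first integer token on that line as the file count.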
".format(num_critical_alerts) force_notify = True if num_error_alerts > 0: message += "{} Error Alerts found! ".format(num_error_alerts) # Subtract out the siafile alerts from the warning alerts since we announce # them separately num_warning_alerts -= num_siafile_alerts message += "{} Warning Alerts found. ".format(num_warning_alerts) message += "{} SiaFiles with bad health found. ".format(num_siafile_alerts) # Add repair size message += "{} of repair remaining. ".format(repair_remaining) # send a message if we force notification, or just once daily (heartbeat) # on 1 AM if force_notify or datetime.utcnow().hour == 1: return await send_msg( message, file=siac_alert_output, force_notify=force_notify ) # check_portal_size checks the number of files that the portal is managing to # determine if it is time to rotate it out async def check_portal_size(): print("\nChecking portal size...") # Execute siac renter to check the size of the portal # # NOTE: we should leave this as always trying to execute the docker command # against the sia container as this will then fail for maintenance severs # were we don't care about this check. cmd_string = "docker exec sia siac renter" siac_renter_output = os.popen(cmd_string).read().strip() # Initialize variables num_files = 0 max_files = 1500000 # 1.5 mln files_text = "Files:" for line in siac_renter_output.split("\n"): if line.strip().startswith(files_text): for el in line.split(): if el.isdigit(): num_files = int(el) ################################################################################ # create a message ################################################################################ message = "" force_notify = False if num_files > max_files: message += "Portal has {} files! Consider rotating! ".format(num_files) # send notification when above 40% of the limit force_notify = num_files > max_files * 1.4 else: message += "Portal has {} files. ".format(num_files) # send a message if we force notification, or just once daily (heartbeat) on 1 AM if force_notify or datetime.utcnow().hour == 1: return await send_msg(message, force_notify=force_notify) loop = asyncio.get_event_loop() loop.run_until_complete(run_checks())