diff --git a/setup-scripts/health-checker.py b/setup-scripts/health-checker.py index 2466c672..53b9c94b 100755 --- a/setup-scripts/health-checker.py +++ b/setup-scripts/health-checker.py @@ -47,6 +47,7 @@ async def run_checks(): await check_disk() await check_health() await check_alerts() + await check_portal_size() except: trace = traceback.format_exc() print("[DEBUG] run_checks() failed.") @@ -78,6 +79,7 @@ async def check_disk(): # We check free disk space in 1024 byte units, so it's easy to convert. df = os.popen("df --block-size=1024").read().strip() volumes = {} + # Iterate over the output, ignoring the header line for line in df.split("\n")[1:]: fields = list(filter(None, line.split(" "))) # -1 is "mounted on", 3 is "available space" in KiB which we want in bytes @@ -211,33 +213,53 @@ async def check_health(): client, message, file=failed_records_file, force_notify=force_notify ) + +# contains_string is a simple helper to check if a string contains a string. +# This is faster and easier than regex for word comparisons +def contains_string(string_to_check, string_to_find): + return string_to_find in string_to_check + # check_alerts checks the alerts returned from siad's daemon/alerts API async def check_alerts(): print("\nChecking portal siad alerts...") - try: - alerts_res = requests.get("http://localhost:9980/daemon/alerts",headers={"User-Agent":"Sia-Agent"}, verify=False) - alerts_json = alerts_res.json() - except: - trace = traceback.format_exc() - print("[DEBUG] check_alerts() failed.") - return await send_msg( - client, "Failed to run the checks!", file=trace, force_notify=True - ) + # Execute siac alerts and read the response + # TODO: is the container name always `sia` for production servers? Is it + # only changed to the server name when it it is moved to Maintenance? Will + # this just never check the alerts on the maintenance servers? + cmd_string = "docker exec sia siac alert" + siac_alert_output = os.popen(cmd_string).read().strip() - alerts = alerts_json['alerts'] - critical_alerts = alerts_json['criticalalerts'] - error_alerts = alerts_json['erroralerts'] - warning_alerts = alerts_json['warningalerts'] + # Initialize variables + num_critical_alerts = 0 + num_error_alerts = 0 + num_warning_alerts = 0 + num_siafile_alerts =0 siafile_alerts = [] + + # Pattern strings to search for + critical = 'Severity: critical' + error = 'Severity: error' + warning = 'Severity: warning' + health_of = 'has a health of' siafile_alert_message = "The SiaFile mentioned in the 'Cause' is below 75% redundancy" + + # Split the output by line and check for type of alert and siafile alerts + for line in siac_alert_output.split("\n"): + # Check for the type of alert + if contains_string(lin,critical): + num_critical_alerts++ + if contains_string(lin,error): + num_error_alerts++ + if contains_string(lin,warning): + num_warning_alerts++ - # Check for siafile alerts in alerts. This is so that the alert severity - # can change and this doesn't need to be updated - for alert in alerts: - if alert['msg'] == siafile_alert_message: - siafile_alerts.append(alert) - + # Check for siafile alerts in alerts. This is so that the alert + # severity can change and this doesn't need to be updated + if contains_string(line,siafile_alert_message): + num_siafile_alerts++ + if contains_string(line,health_of) + siafile_alerts.append(line) ################################################################################ ################ create a message @@ -246,25 +268,63 @@ async def check_alerts(): message = "" force_notify = False - if len(critical_alerts) > 0: - message += "{} CRITICAL Alerts found! ".format(len(critical_alerts)) + if num_critical_alerts > 0: + message += "{} CRITICAL Alerts found! ".format(num_critical_alerts) force_notify = True - if len(error_alerts) > 0: - message += "{} Error Alerts found! ".format(len(error_alerts)) + if num_error_alerts > 0: + message += "{} Error Alerts found! ".format(num_error_alerts) force_notify = True - message += "{} Warning Alerts found. ".format(len(warning_alerts)) - message += "{} SiaFiles with bad health found. ".format(len(siafile_alerts)) + message += "{} Warning Alerts found. ".format(num_warning_alerts) + message += "{} SiaFiles with bad health found. ".format(num_siafile_alerts) - alerts_file = None - if len(alerts) > 0: - alerts_file = json.dumps(alerts, indent=2) + # send a message if we force notification, or just once daily (heartbeat) + # on 1 AM + if force_notify or datetime.utcnow().hour == 1: + return await send_msg( + client, message, file=siac_alert_output, force_notify=force_notify + ) + +# check_portal_size checks the number of files that the portal is managing to +# determine if it is time to rotate it out +async def check_portal_size(): + print("\nChecking portal size...") + + # Execute siac renter to check the size of the portal + # + # NOTE: we should leave this as always trying to execute the docker command + # against the sia container as this will then fail for maintenance severs + # were we don't care about this check. + cmd_string = "docker exec sia siac renter" + siac_renter_output = os.popen(cmd_string).read().strip() + + # Initialize variables + num_files = 0 + max_files = 250000 + files_text = "Files" + for line in siac_renter_output.split("\n"): + if contains_string(line,files_text): + for el in line.split(): + if el.isdigit(): + num_files = int(el) + + ################################################################################ + ################ create a message + ################################################################################ + + message = "" + force_notify = False + + if num_files > max_files: + message += "Portal has {} files! Consider rotating! ".format(num_files) + force_notify = True + else: + message += "Portal has {} files. ".format(num_files) # send a message if we force notification, or just once daily (heartbeat) on 1 AM if force_notify or datetime.utcnow().hour == 1: return await send_msg( - client, message, file=alerts_file, force_notify=force_notify + client, message, force_notify=force_notify ) - client.run(bot_token)