diff --git a/setup-scripts/health-checker.py b/setup-scripts/health-checker.py index 53b9c94b..a187b5d8 100755 --- a/setup-scripts/health-checker.py +++ b/setup-scripts/health-checker.py @@ -1,6 +1,14 @@ #!/usr/bin/env python3 -import asyncio, json, os, re, sys, traceback, discord, requests, time +import asyncio +import json +import os +import re +import sys +import traceback +import discord +import requests +import time from datetime import datetime, timedelta from bot_utils import setup, send_msg @@ -69,7 +77,8 @@ async def check_load_average(): pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$" load_av = re.match(pattern, uptime_string).group(1) if float(load_av) > 10: - message = "High system load detected in uptime output: {}".format(uptime_string) + message = "High system load detected in uptime output: {}".format( + uptime_string) await send_msg(client, message, force_notify=True) @@ -97,17 +106,19 @@ async def check_disk(): message = "Failed to check free disk space! Didn't find a suitable mount point to check." return await send_msg(client, message, file=df) - # if we've reached a critical free disk space threshold we need to send proper notice + # if we've reached a critical free disk space threshold we need to send proper notice # and shut down sia container so it doesn't get corrupted if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD_CRITICAL: free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB) - message = "CRITICAL! Very low disk space: {}GiB, **siad stopped**!".format(free_space_gb) + message = "CRITICAL! Very low disk space: {}GiB, **siad stopped**!".format( + free_space_gb) inspect = os.popen("docker inspect sia").read().strip() inspect_json = json.loads(inspect) if inspect_json[0]["State"]["Running"] == True: - os.popen("docker exec health-check cli/disable") # mark portal as unhealthy - time.sleep(300) # wait 5 minutes to propagate dns changes - os.popen("docker stop sia") # stop sia container + # mark portal as unhealthy + os.popen("docker exec health-check cli/disable") + time.sleep(300) # wait 5 minutes to propagate dns changes + os.popen("docker stop sia") # stop sia container return await send_msg(client, message, force_notify=True) # if we're reached a free disk space threshold we need to send proper notice @@ -175,7 +186,7 @@ async def check_health(): failed_records.append(verbose) ################################################################################ - ################ create a message + # create a message ################################################################################ message = "" @@ -194,7 +205,8 @@ async def check_health(): ) force_notify = True else: - message += "All {} critical checks passed. ".format(critical_checks_total) + message += "All {} critical checks passed. ".format( + critical_checks_total) if verbose_checks_failed: message += "{}/{} verbose checks failed over the last {} hours! ".format( @@ -202,7 +214,8 @@ async def check_health(): ) force_notify = True else: - message += "All {} verbose checks passed. ".format(verbose_checks_total) + message += "All {} verbose checks passed. ".format( + verbose_checks_total) if len(failed_records): failed_records_file = json.dumps(failed_records, indent=2) @@ -217,9 +230,11 @@ async def check_health(): # contains_string is a simple helper to check if a string contains a string. # This is faster and easier than regex for word comparisons def contains_string(string_to_check, string_to_find): - return string_to_find in string_to_check + return string_to_find in string_to_check # check_alerts checks the alerts returned from siad's daemon/alerts API + + async def check_alerts(): print("\nChecking portal siad alerts...") @@ -231,10 +246,10 @@ async def check_alerts(): siac_alert_output = os.popen(cmd_string).read().strip() # Initialize variables - num_critical_alerts = 0 - num_error_alerts = 0 - num_warning_alerts = 0 - num_siafile_alerts =0 + num_critical_alerts = 0 + num_error_alerts = 0 + num_warning_alerts = 0 + num_siafile_alerts = 0 siafile_alerts = [] # Pattern strings to search for @@ -243,26 +258,26 @@ async def check_alerts(): warning = 'Severity: warning' health_of = 'has a health of' siafile_alert_message = "The SiaFile mentioned in the 'Cause' is below 75% redundancy" - + # Split the output by line and check for type of alert and siafile alerts for line in siac_alert_output.split("\n"): - # Check for the type of alert - if contains_string(lin,critical): + # Check for the type of alert + if contains_string(lin, critical): num_critical_alerts++ - if contains_string(lin,error): + if contains_string(lin, error): num_error_alerts++ - if contains_string(lin,warning): + if contains_string(lin, warning): num_warning_alerts++ # Check for siafile alerts in alerts. This is so that the alert # severity can change and this doesn't need to be updated - if contains_string(line,siafile_alert_message): + if contains_string(line, siafile_alert_message): num_siafile_alerts++ - if contains_string(line,health_of) - siafile_alerts.append(line) + if contains_string(line, health_of) + siafile_alerts.append(line) ################################################################################ - ################ create a message + # create a message ################################################################################ message = "" @@ -274,7 +289,7 @@ async def check_alerts(): if num_error_alerts > 0: message += "{} Error Alerts found! ".format(num_error_alerts) force_notify = True - + message += "{} Warning Alerts found. ".format(num_warning_alerts) message += "{} SiaFiles with bad health found. ".format(num_siafile_alerts) @@ -287,6 +302,8 @@ async def check_alerts(): # check_portal_size checks the number of files that the portal is managing to # determine if it is time to rotate it out + + async def check_portal_size(): print("\nChecking portal size...") @@ -303,13 +320,13 @@ async def check_portal_size(): max_files = 250000 files_text = "Files" for line in siac_renter_output.split("\n"): - if contains_string(line,files_text): + if contains_string(line, files_text): for el in line.split(): if el.isdigit(): num_files = int(el) ################################################################################ - ################ create a message + # create a message ################################################################################ message = "" @@ -318,7 +335,7 @@ async def check_portal_size(): if num_files > max_files: message += "Portal has {} files! Consider rotating! ".format(num_files) force_notify = True - else: + else: message += "Portal has {} files. ".format(num_files) # send a message if we force notification, or just once daily (heartbeat) on 1 AM