scripts: update file health check to check siac output. Add total files check
This commit is contained in:
parent
7bfa2c8e60
commit
efc6060924
|
@ -47,6 +47,7 @@ async def run_checks():
|
||||||
await check_disk()
|
await check_disk()
|
||||||
await check_health()
|
await check_health()
|
||||||
await check_alerts()
|
await check_alerts()
|
||||||
|
await check_portal_size()
|
||||||
except:
|
except:
|
||||||
trace = traceback.format_exc()
|
trace = traceback.format_exc()
|
||||||
print("[DEBUG] run_checks() failed.")
|
print("[DEBUG] run_checks() failed.")
|
||||||
|
@ -78,6 +79,7 @@ async def check_disk():
|
||||||
# We check free disk space in 1024 byte units, so it's easy to convert.
|
# We check free disk space in 1024 byte units, so it's easy to convert.
|
||||||
df = os.popen("df --block-size=1024").read().strip()
|
df = os.popen("df --block-size=1024").read().strip()
|
||||||
volumes = {}
|
volumes = {}
|
||||||
|
# Iterate over the output, ignoring the header line
|
||||||
for line in df.split("\n")[1:]:
|
for line in df.split("\n")[1:]:
|
||||||
fields = list(filter(None, line.split(" ")))
|
fields = list(filter(None, line.split(" ")))
|
||||||
# -1 is "mounted on", 3 is "available space" in KiB which we want in bytes
|
# -1 is "mounted on", 3 is "available space" in KiB which we want in bytes
|
||||||
|
@ -211,33 +213,53 @@ async def check_health():
|
||||||
client, message, file=failed_records_file, force_notify=force_notify
|
client, message, file=failed_records_file, force_notify=force_notify
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# contains_string reports whether string_to_find occurs anywhere inside
# string_to_check. A plain substring test is faster and simpler than a regex
# when looking for fixed words or phrases.
def contains_string(string_to_check, string_to_find):
    found = string_to_find in string_to_check
    return found
|
||||||
|
|
||||||
# check_alerts runs `siac alert` inside the `sia` docker container, counts the
# alerts of each severity found in its text output, and reports the totals.
#
# A notification is forced when at least one critical or error alert is found;
# otherwise the summary is only sent as a daily heartbeat during the 01:00 UTC
# hour. The raw siac output is attached to the message as a file.
async def check_alerts():
    print("\nChecking portal siad alerts...")

    # Execute siac alert and read the response
    # TODO: is the container name always `sia` for production servers? Is it
    # only changed to the server name when it is moved to Maintenance? Will
    # this just never check the alerts on the maintenance servers?
    cmd_string = "docker exec sia siac alert"
    siac_alert_output = os.popen(cmd_string).read().strip()

    # Initialize variables
    num_critical_alerts = 0
    num_error_alerts = 0
    num_warning_alerts = 0
    num_siafile_alerts = 0
    # NOTE: the collected lines are not used in the message below yet; only the
    # siafile alert count is reported.
    siafile_alerts = []

    # Pattern strings to search for
    critical = 'Severity: critical'
    error = 'Severity: error'
    warning = 'Severity: warning'
    health_of = 'has a health of'
    siafile_alert_message = "The SiaFile mentioned in the 'Cause' is below 75% redundancy"

    # Split the output by line and check for type of alert and siafile alerts
    for line in siac_alert_output.split("\n"):
        # Check for the type of alert
        if contains_string(line, critical):
            num_critical_alerts += 1
        if contains_string(line, error):
            num_error_alerts += 1
        if contains_string(line, warning):
            num_warning_alerts += 1

        # Check for siafile alerts in alerts. This is so that the alert
        # severity can change and this doesn't need to be updated
        if contains_string(line, siafile_alert_message):
            num_siafile_alerts += 1
        if contains_string(line, health_of):
            siafile_alerts.append(line)

    ################################################################################
    ################ create a message
    ################################################################################

    message = ""
    force_notify = False

    if num_critical_alerts > 0:
        message += "{} CRITICAL Alerts found! ".format(num_critical_alerts)
        force_notify = True
    if num_error_alerts > 0:
        message += "{} Error Alerts found! ".format(num_error_alerts)
        force_notify = True

    message += "{} Warning Alerts found. ".format(num_warning_alerts)
    message += "{} SiaFiles with bad health found. ".format(num_siafile_alerts)

    # send a message if we force notification, or just once daily (heartbeat)
    # on 1 AM
    if force_notify or datetime.utcnow().hour == 1:
        return await send_msg(
            client, message, file=siac_alert_output, force_notify=force_notify
        )
|
||||||
|
|
||||||
|
# check_portal_size reads the number of files the portal is managing from the
# `siac renter` output and reports it, so operators can tell when it is time
# to rotate the portal out.
async def check_portal_size():
    print("\nChecking portal size...")

    # Ask siac (inside the sia container) for the renter summary.
    #
    # NOTE: this should always target the `sia` container. The command then
    # fails on maintenance servers, where we don't care about this check.
    renter_report = os.popen("docker exec sia siac renter").read().strip()

    # Threshold above which a rotation is suggested
    file_limit = 250000

    # Scan every line mentioning "Files"; the last all-digit token seen on
    # such a line is taken as the file count.
    file_count = 0
    for row in renter_report.split("\n"):
        if not contains_string(row, "Files"):
            continue
        for token in row.split():
            if token.isdigit():
                file_count = int(token)

    ################################################################################
    ################ create a message
    ################################################################################

    over_limit = file_count > file_limit
    if over_limit:
        message = "Portal has {} files! Consider rotating! ".format(file_count)
    else:
        message = "Portal has {} files. ".format(file_count)

    # Notify immediately when over the limit; otherwise only send the daily
    # heartbeat during the 1 AM (UTC) hour.
    if over_limit or datetime.utcnow().hour == 1:
        return await send_msg(client, message, force_notify=over_limit)
|
||||||
|
|
||||||
|
|
||||||
client.run(bot_token)
|
client.run(bot_token)
|
||||||
|
|
Reference in New Issue