fixed health check blowing up on eu-fin-3 (#838)
* request 127.0.0.1 over https - http localhost causes issues
* reformat with black
This commit is contained in:
parent
d09c372a95
commit
b8a6816876
|
@ -83,8 +83,7 @@ async def check_load_average():
|
|||
pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
|
||||
load_av = re.match(pattern, uptime_string).group(1)
|
||||
if float(load_av) > 10:
|
||||
message = "High system load detected in uptime output: {}".format(
|
||||
uptime_string)
|
||||
message = "High system load detected in uptime output: {}".format(uptime_string)
|
||||
await send_msg(client, message, force_notify=True)
|
||||
|
||||
|
||||
|
@ -117,7 +116,8 @@ async def check_disk():
|
|||
if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD_CRITICAL:
|
||||
free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
|
||||
message = "CRITICAL! Very low disk space: {}GiB, **siad stopped**!".format(
|
||||
free_space_gb)
|
||||
free_space_gb
|
||||
)
|
||||
inspect = os.popen("docker inspect sia").read().strip()
|
||||
inspect_json = json.loads(inspect)
|
||||
if inspect_json[0]["State"]["Running"] == True:
|
||||
|
@ -139,13 +139,13 @@ async def check_health():
|
|||
print("\nChecking portal health status...")
|
||||
|
||||
try:
|
||||
res_check = requests.get("http://localhost/health-check", verify=False)
|
||||
res_check = requests.get("https://127.0.0.1/health-check", verify=False)
|
||||
json_check = res_check.json()
|
||||
json_critical = requests.get(
|
||||
"http://localhost/health-check/critical", verify=False
|
||||
"https://127.0.0.1/health-check/critical", verify=False
|
||||
).json()
|
||||
json_extended = requests.get(
|
||||
"http://localhost/health-check/extended", verify=False
|
||||
"https://127.0.0.1/health-check/extended", verify=False
|
||||
).json()
|
||||
except:
|
||||
trace = traceback.format_exc()
|
||||
|
@ -210,8 +210,7 @@ async def check_health():
|
|||
)
|
||||
force_notify = True
|
||||
else:
|
||||
message += "All {} critical checks passed. ".format(
|
||||
critical_checks_total)
|
||||
message += "All {} critical checks passed. ".format(critical_checks_total)
|
||||
|
||||
if extended_checks_failed:
|
||||
message += "{}/{} extended checks failed over the last {} hours! ".format(
|
||||
|
@ -219,14 +218,18 @@ async def check_health():
|
|||
)
|
||||
force_notify = True
|
||||
else:
|
||||
message += "All {} extended checks passed. ".format(
|
||||
extended_checks_total)
|
||||
message += "All {} extended checks passed. ".format(extended_checks_total)
|
||||
|
||||
if len(failed_records):
|
||||
failed_records_file = json.dumps(failed_records, indent=2)
|
||||
|
||||
# send a message if we force notification, there is a failures dump or just once daily (heartbeat) on 1 AM
|
||||
if force_notify or json_check["disabled"] or failed_records_file or datetime.utcnow().hour == 1:
|
||||
if (
|
||||
force_notify
|
||||
or json_check["disabled"]
|
||||
or failed_records_file
|
||||
or datetime.utcnow().hour == 1
|
||||
):
|
||||
return await send_msg(
|
||||
client, message, file=failed_records_file, force_notify=force_notify
|
||||
)
|
||||
|
@ -259,11 +262,13 @@ async def check_alerts():
|
|||
siafile_alerts = []
|
||||
|
||||
# Pattern strings to search for
|
||||
critical = 'Severity: critical'
|
||||
error = 'Severity: error'
|
||||
warning = 'Severity: warning'
|
||||
health_of = 'has a health of'
|
||||
siafile_alert_message = "The SiaFile mentioned in the 'Cause' is below 75% redundancy"
|
||||
critical = "Severity: critical"
|
||||
error = "Severity: error"
|
||||
warning = "Severity: warning"
|
||||
health_of = "has a health of"
|
||||
siafile_alert_message = (
|
||||
"The SiaFile mentioned in the 'Cause' is below 75% redundancy"
|
||||
)
|
||||
|
||||
# Split the output by line and check for type of alert and siafile alerts
|
||||
for line in siac_alert_output.split("\n"):
|
||||
|
@ -288,10 +293,10 @@ async def check_alerts():
|
|||
siac_renter_output = os.popen(cmd_string).read().strip()
|
||||
|
||||
# Initialize variables
|
||||
repair_remaining = ''
|
||||
repair_remaining = ""
|
||||
|
||||
# Pattern strings to search for
|
||||
repair_str = 'Repair Data Remaining'
|
||||
repair_str = "Repair Data Remaining"
|
||||
|
||||
# Split the output by line and check for the repair remaining
|
||||
for line in siac_renter_output.split("\n"):
|
||||
|
@ -368,9 +373,7 @@ async def check_portal_size():
|
|||
|
||||
# send a message if we force notification, or just once daily (heartbeat) on 1 AM
|
||||
if force_notify or datetime.utcnow().hour == 1:
|
||||
return await send_msg(
|
||||
client, message, force_notify=force_notify
|
||||
)
|
||||
return await send_msg(client, message, force_notify=force_notify)
|
||||
|
||||
|
||||
client.run(bot_token)
|
||||
|
|
Reference in New Issue