fixed health check blowing up on eu-fin-3 (#838)

* request 127.0.0.1 over https - http localhost causes issues

* reformat with black
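
The first bullet above switches the health probe from plain http://localhost to https against the loopback address. A minimal sketch of that probe pattern, matching the `requests` usage in the diff below; the timeout and the urllib3 warning suppression are illustrative additions, not part of this commit:

```python
import requests
import urllib3

# TLS is terminated locally, but the certificate will not match 127.0.0.1,
# so verification is disabled; silence the resulting InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Probe the portal health endpoint over https on the loopback address
# instead of plain http://localhost.
res_check = requests.get("https://127.0.0.1/health-check", verify=False, timeout=30)
print(res_check.json())
```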
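The second bullet is the black reformat that collapses the wrapped calls in this diff. A small sketch of the same transformation using black's Python helpers `black.format_str` and `black.Mode` (black does not guarantee a stable library API, so treat this as illustrative; the default 88-character line length is assumed):

```python
import black

# One of the calls this commit collapses, in its pre-black wrapped form.
src = (
    'message = "All {} critical checks passed. ".format(\n'
    "    critical_checks_total)\n"
)

# format_str applies the same rules as running `black` on the file;
# the collapsed call fits within 88 characters, so black joins it onto one line.
print(black.format_str(src, mode=black.Mode()))
```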
Karol Wypchło 2021-06-07 15:08:18 +02:00 committed by GitHub
parent d09c372a95
commit b8a6816876
1 changed file with 30 additions and 27 deletions

@@ -83,8 +83,7 @@ async def check_load_average():
pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
load_av = re.match(pattern, uptime_string).group(1)
if float(load_av) > 10:
-message = "High system load detected in uptime output: {}".format(
-uptime_string)
+message = "High system load detected in uptime output: {}".format(uptime_string)
await send_msg(client, message, force_notify=True)
@@ -117,7 +116,8 @@ async def check_disk():
if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD_CRITICAL:
free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
message = "CRITICAL! Very low disk space: {}GiB, **siad stopped**!".format(
-free_space_gb)
+free_space_gb
+)
inspect = os.popen("docker inspect sia").read().strip()
inspect_json = json.loads(inspect)
if inspect_json[0]["State"]["Running"] == True:
@@ -139,13 +139,13 @@ async def check_health():
print("\nChecking portal health status...")
try:
-res_check = requests.get("http://localhost/health-check", verify=False)
+res_check = requests.get("https://127.0.0.1/health-check", verify=False)
json_check = res_check.json()
json_critical = requests.get(
-"http://localhost/health-check/critical", verify=False
+"https://127.0.0.1/health-check/critical", verify=False
).json()
json_extended = requests.get(
-"http://localhost/health-check/extended", verify=False
+"https://127.0.0.1/health-check/extended", verify=False
).json()
except:
trace = traceback.format_exc()
@@ -210,8 +210,7 @@ async def check_health():
)
force_notify = True
else:
-message += "All {} critical checks passed. ".format(
-critical_checks_total)
+message += "All {} critical checks passed. ".format(critical_checks_total)
if extended_checks_failed:
message += "{}/{} extended checks failed over the last {} hours! ".format(
@@ -219,14 +218,18 @@ async def check_health():
)
force_notify = True
else:
-message += "All {} extended checks passed. ".format(
-extended_checks_total)
+message += "All {} extended checks passed. ".format(extended_checks_total)
if len(failed_records):
failed_records_file = json.dumps(failed_records, indent=2)
# send a message if we force notification, there is a failures dump or just once daily (heartbeat) on 1 AM
-if force_notify or json_check["disabled"] or failed_records_file or datetime.utcnow().hour == 1:
+if (
+force_notify
+or json_check["disabled"]
+or failed_records_file
+or datetime.utcnow().hour == 1
+):
return await send_msg(
client, message, file=failed_records_file, force_notify=force_notify
)
@@ -246,7 +249,7 @@ async def check_alerts():
# parse siac
################################################################################
-# Alerts
+# Alerts
# Execute 'siac alerts' and read the response
cmd_string = "docker exec {} siac alerts".format(CONTAINER_NAME)
siac_alert_output = os.popen(cmd_string).read().strip()
@@ -259,11 +262,13 @@ async def check_alerts():
siafile_alerts = []
# Pattern strings to search for
-critical = 'Severity: critical'
-error = 'Severity: error'
-warning = 'Severity: warning'
-health_of = 'has a health of'
-siafile_alert_message = "The SiaFile mentioned in the 'Cause' is below 75% redundancy"
+critical = "Severity: critical"
+error = "Severity: error"
+warning = "Severity: warning"
+health_of = "has a health of"
+siafile_alert_message = (
+"The SiaFile mentioned in the 'Cause' is below 75% redundancy"
+)
# Split the output by line and check for type of alert and siafile alerts
for line in siac_alert_output.split("\n"):
@@ -282,23 +287,23 @@ async def check_alerts():
if contains_string(line, health_of):
siafile_alerts.append(line)
-# Repair Size
+# Repair Size
# Execute 'siac renter' and read the response
cmd_string = "docker exec {} siac renter".format(CONTAINER_NAME)
siac_renter_output = os.popen(cmd_string).read().strip()
# Initialize variables
-repair_remaining = ''
+repair_remaining = ""
# Pattern strings to search for
-repair_str = 'Repair Data Remaining'
+repair_str = "Repair Data Remaining"
# Split the output by line and check for the repair remaining
for line in siac_renter_output.split("\n"):
# Check for the type of alert
if contains_string(line, repair_str):
repair_remaining = line.split(":")[1].strip()
################################################################################
# create a message
################################################################################
@@ -317,7 +322,7 @@ async def check_alerts():
num_warning_alerts -= num_siafile_alerts
message += "{} Warning Alerts found. ".format(num_warning_alerts)
message += "{} SiaFiles with bad health found. ".format(num_siafile_alerts)
# Add repair size
message += "{} of repair remaining. ".format(repair_remaining)
@@ -344,7 +349,7 @@ async def check_portal_size():
# Initialize variables
num_files = 0
-max_files = 1500000 # 1.5 mln
+max_files = 1500000  # 1.5 mln
files_text = "Files"
for line in siac_renter_output.split("\n"):
if contains_string(line, files_text):
@@ -368,9 +373,7 @@ async def check_portal_size():
# send a message if we force notification, or just once daily (heartbeat) on 1 AM
if force_notify or datetime.utcnow().hour == 1:
-return await send_msg(
-client, message, force_notify=force_notify
-)
+return await send_msg(client, message, force_notify=force_notify)
client.run(bot_token)