fixed health check blowing up on eu-fin-3 (#838)

* request 127.0.0.1 over https - http localhost causes issues (see the sketch below)

* reformat with black
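
A minimal sketch of the first change, assuming the portal terminates TLS locally with a certificate issued for its public hostname rather than for 127.0.0.1 (which is why verify=False is needed); the endpoint path comes from the diff below:

import requests

# Before: plain-http request to localhost, which blew up on eu-fin-3.
# res = requests.get("http://localhost/health-check", verify=False)

# After: https against the loopback address; verify=False skips certificate
# validation, since the certificate does not cover 127.0.0.1.
res = requests.get("https://127.0.0.1/health-check", verify=False)
print(res.json())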
Karol Wypchło 2021-06-07 15:08:18 +02:00 committed by GitHub
parent d09c372a95
commit b8a6816876
1 changed file with 30 additions and 27 deletions

@@ -83,8 +83,7 @@ async def check_load_average():
     pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
     load_av = re.match(pattern, uptime_string).group(1)
     if float(load_av) > 10:
-        message = "High system load detected in uptime output: {}".format(
-            uptime_string)
+        message = "High system load detected in uptime output: {}".format(uptime_string)
         await send_msg(client, message, force_notify=True)
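For reference, a small sketch of how the load-average regex above behaves; the sample uptime output is an assumption, not taken from the commit:

import re

# Hypothetical `uptime` output; real output varies by platform.
uptime_string = "15:08:18 up 42 days, 3:12, 1 user, load average: 0.52, 0.58, 9.59"

pattern = r"^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
load_av = re.match(pattern, uptime_string).group(1)  # the 15-minute average
assert load_av == "9.59"  # values above 10 would trigger send_msg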
@@ -117,7 +116,8 @@ async def check_disk():
         if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD_CRITICAL:
             free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
             message = "CRITICAL! Very low disk space: {}GiB, **siad stopped**!".format(
-                free_space_gb)
+                free_space_gb
+            )
             inspect = os.popen("docker inspect sia").read().strip()
             inspect_json = json.loads(inspect)
             if inspect_json[0]["State"]["Running"] == True:
@@ -139,13 +139,13 @@ async def check_health():
     print("\nChecking portal health status...")

     try:
-        res_check = requests.get("http://localhost/health-check", verify=False)
+        res_check = requests.get("https://127.0.0.1/health-check", verify=False)
         json_check = res_check.json()
         json_critical = requests.get(
-            "http://localhost/health-check/critical", verify=False
+            "https://127.0.0.1/health-check/critical", verify=False
         ).json()
         json_extended = requests.get(
-            "http://localhost/health-check/extended", verify=False
+            "https://127.0.0.1/health-check/extended", verify=False
         ).json()
     except:
         trace = traceback.format_exc()
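One side effect worth noting: with verify=False, requests (via urllib3) emits an InsecureRequestWarning on every call. A sketch of silencing it, which is an assumption here and not part of this commit:

import urllib3

# Suppress the InsecureRequestWarning that verify=False triggers; acceptable
# here because the checks deliberately target 127.0.0.1 over TLS.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)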
@@ -210,8 +210,7 @@ async def check_health():
         )
         force_notify = True
     else:
-        message += "All {} critical checks passed. ".format(
-            critical_checks_total)
+        message += "All {} critical checks passed. ".format(critical_checks_total)

     if extended_checks_failed:
         message += "{}/{} extended checks failed over the last {} hours! ".format(
@@ -219,14 +218,18 @@ async def check_health():
         )
         force_notify = True
     else:
-        message += "All {} extended checks passed. ".format(
-            extended_checks_total)
+        message += "All {} extended checks passed. ".format(extended_checks_total)

     if len(failed_records):
         failed_records_file = json.dumps(failed_records, indent=2)

     # send a message if we force notification, there is a failures dump or just once daily (heartbeat) on 1 AM
-    if force_notify or json_check["disabled"] or failed_records_file or datetime.utcnow().hour == 1:
+    if (
+        force_notify
+        or json_check["disabled"]
+        or failed_records_file
+        or datetime.utcnow().hour == 1
+    ):
         return await send_msg(
             client, message, file=failed_records_file, force_notify=force_notify
         )
@@ -246,7 +249,7 @@ async def check_alerts():
     # parse siac
     ################################################################################

     # Alerts
     # Execute 'siac alerts' and read the response
     cmd_string = "docker exec {} siac alerts".format(CONTAINER_NAME)
     siac_alert_output = os.popen(cmd_string).read().strip()
@@ -259,11 +262,13 @@ async def check_alerts():
     siafile_alerts = []

     # Pattern strings to search for
-    critical = 'Severity: critical'
-    error = 'Severity: error'
-    warning = 'Severity: warning'
-    health_of = 'has a health of'
-    siafile_alert_message = "The SiaFile mentioned in the 'Cause' is below 75% redundancy"
+    critical = "Severity: critical"
+    error = "Severity: error"
+    warning = "Severity: warning"
+    health_of = "has a health of"
+    siafile_alert_message = (
+        "The SiaFile mentioned in the 'Cause' is below 75% redundancy"
+    )

     # Split the output by line and check for type of alert and siafile alerts
     for line in siac_alert_output.split("\n"):
@@ -282,23 +287,23 @@ async def check_alerts():
         if contains_string(line, health_of):
             siafile_alerts.append(line)

     # Repair Size
     # Execute 'siac renter' and read the response
     cmd_string = "docker exec {} siac renter".format(CONTAINER_NAME)
     siac_renter_output = os.popen(cmd_string).read().strip()

     # Initialize variables
-    repair_remaining = ''
+    repair_remaining = ""

     # Pattern strings to search for
-    repair_str = 'Repair Data Remaining'
+    repair_str = "Repair Data Remaining"

     # Split the output by line and check for the repair remaining
     for line in siac_renter_output.split("\n"):
         # Check for the type of alert
         if contains_string(line, repair_str):
             repair_remaining = line.split(":")[1].strip()

     ################################################################################
     # create a message
     ################################################################################
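As a sketch of the repair-size parsing above, assuming `siac renter` prints a line like the hypothetical one below:

# Hypothetical `siac renter` output line; the real format may differ.
line = "Repair Data Remaining: 1.5 TB"
repair_remaining = line.split(":")[1].strip()
assert repair_remaining == "1.5 TB"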
@@ -317,7 +322,7 @@ async def check_alerts():
     num_warning_alerts -= num_siafile_alerts
     message += "{} Warning Alerts found. ".format(num_warning_alerts)
     message += "{} SiaFiles with bad health found. ".format(num_siafile_alerts)

     # Add repair size
     message += "{} of repair remaining. ".format(repair_remaining)
@@ -344,7 +349,7 @@ async def check_portal_size():
     # Initialize variables
     num_files = 0
-    max_files = 1500000 # 1.5 mln
+    max_files = 1500000  # 1.5 mln
     files_text = "Files"

     for line in siac_renter_output.split("\n"):
         if contains_string(line, files_text):
@@ -368,9 +373,7 @@ async def check_portal_size():

     # send a message if we force notification, or just once daily (heartbeat) on 1 AM
     if force_notify or datetime.utcnow().hour == 1:
-        return await send_msg(
-            client, message, force_notify=force_notify
-        )
+        return await send_msg(client, message, force_notify=force_notify)


 client.run(bot_token)