fixed health check blowing up on eu-fin-3 (#838)
* request 127.0.0.1 over https - http localhost causes issues
* reformat with black
parent d09c372a95
commit b8a6816876
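Context for the URL change: the health checks now request https://127.0.0.1 directly instead of http://localhost, which was blowing up on eu-fin-3. A minimal sketch of the resulting request pattern follows; the warning suppression is an assumption on my part and not part of this commit (requests emits an InsecureRequestWarning whenever verify=False is used, presumably because the portal's certificate is issued for its public hostname rather than 127.0.0.1):

import requests
import urllib3

# verify=False skips TLS certificate verification; silence the
# InsecureRequestWarning it would otherwise print on every check.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

res_check = requests.get("https://127.0.0.1/health-check", verify=False)
json_check = res_check.json()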
@@ -83,8 +83,7 @@ async def check_load_average():
     pattern = "^.*load average: \d*\.\d*, \d*\.\d*, (\d*\.\d*)$"
     load_av = re.match(pattern, uptime_string).group(1)
     if float(load_av) > 10:
-        message = "High system load detected in uptime output: {}".format(
-            uptime_string)
+        message = "High system load detected in uptime output: {}".format(uptime_string)
         await send_msg(client, message, force_notify=True)

@@ -117,7 +116,8 @@ async def check_disk():
         if int(volumes[vol]) < FREE_DISK_SPACE_THRESHOLD_CRITICAL:
             free_space_gb = "{:.2f}".format(int(volumes[vol]) / GB)
             message = "CRITICAL! Very low disk space: {}GiB, **siad stopped**!".format(
-                free_space_gb)
+                free_space_gb
+            )
             inspect = os.popen("docker inspect sia").read().strip()
             inspect_json = json.loads(inspect)
             if inspect_json[0]["State"]["Running"] == True:
@@ -139,13 +139,13 @@ async def check_health():
     print("\nChecking portal health status...")

     try:
-        res_check = requests.get("http://localhost/health-check", verify=False)
+        res_check = requests.get("https://127.0.0.1/health-check", verify=False)
         json_check = res_check.json()
         json_critical = requests.get(
-            "http://localhost/health-check/critical", verify=False
+            "https://127.0.0.1/health-check/critical", verify=False
         ).json()
         json_extended = requests.get(
-            "http://localhost/health-check/extended", verify=False
+            "https://127.0.0.1/health-check/extended", verify=False
         ).json()
     except:
         trace = traceback.format_exc()
@@ -210,8 +210,7 @@ async def check_health():
         )
         force_notify = True
     else:
-        message += "All {} critical checks passed. ".format(
-            critical_checks_total)
+        message += "All {} critical checks passed. ".format(critical_checks_total)

     if extended_checks_failed:
         message += "{}/{} extended checks failed over the last {} hours! ".format(
@@ -219,14 +218,18 @@ async def check_health():
         )
         force_notify = True
     else:
-        message += "All {} extended checks passed. ".format(
-            extended_checks_total)
+        message += "All {} extended checks passed. ".format(extended_checks_total)

     if len(failed_records):
         failed_records_file = json.dumps(failed_records, indent=2)

     # send a message if we force notification, there is a failures dump or just once daily (heartbeat) on 1 AM
-    if force_notify or json_check["disabled"] or failed_records_file or datetime.utcnow().hour == 1:
+    if (
+        force_notify
+        or json_check["disabled"]
+        or failed_records_file
+        or datetime.utcnow().hour == 1
+    ):
         return await send_msg(
             client, message, file=failed_records_file, force_notify=force_notify
         )
@@ -246,7 +249,7 @@ async def check_alerts():
     # parse siac
     ################################################################################

     # Alerts
     # Execute 'siac alerts' and read the response
     cmd_string = "docker exec {} siac alerts".format(CONTAINER_NAME)
     siac_alert_output = os.popen(cmd_string).read().strip()
@@ -259,11 +262,13 @@ async def check_alerts():
     siafile_alerts = []

     # Pattern strings to search for
-    critical = 'Severity: critical'
-    error = 'Severity: error'
-    warning = 'Severity: warning'
-    health_of = 'has a health of'
-    siafile_alert_message = "The SiaFile mentioned in the 'Cause' is below 75% redundancy"
+    critical = "Severity: critical"
+    error = "Severity: error"
+    warning = "Severity: warning"
+    health_of = "has a health of"
+    siafile_alert_message = (
+        "The SiaFile mentioned in the 'Cause' is below 75% redundancy"
+    )

     # Split the output by line and check for type of alert and siafile alerts
     for line in siac_alert_output.split("\n"):
@@ -282,23 +287,23 @@ async def check_alerts():
         if contains_string(line, health_of):
             siafile_alerts.append(line)

     # Repair Size
     # Execute 'siac renter' and read the response
     cmd_string = "docker exec {} siac renter".format(CONTAINER_NAME)
     siac_renter_output = os.popen(cmd_string).read().strip()

     # Initialize variables
-    repair_remaining = ''
+    repair_remaining = ""

     # Pattern strings to search for
-    repair_str = 'Repair Data Remaining'
+    repair_str = "Repair Data Remaining"

     # Split the output by line and check for the repair remaining
     for line in siac_renter_output.split("\n"):
         # Check for the type of alert
         if contains_string(line, repair_str):
             repair_remaining = line.split(":")[1].strip()

     ################################################################################
     # create a message
     ################################################################################
@@ -317,7 +322,7 @@ async def check_alerts():
     num_warning_alerts -= num_siafile_alerts
     message += "{} Warning Alerts found. ".format(num_warning_alerts)
     message += "{} SiaFiles with bad health found. ".format(num_siafile_alerts)

     # Add repair size
     message += "{} of repair remaining. ".format(repair_remaining)
@@ -344,7 +349,7 @@ async def check_portal_size():

     # Initialize variables
     num_files = 0
     max_files = 1500000  # 1.5 mln
     files_text = "Files"
     for line in siac_renter_output.split("\n"):
         if contains_string(line, files_text):
@@ -368,9 +373,7 @@ async def check_portal_size():

     # send a message if we force notification, or just once daily (heartbeat) on 1 AM
     if force_notify or datetime.utcnow().hour == 1:
-        return await send_msg(
-            client, message, force_notify=force_notify
-        )
+        return await send_msg(client, message, force_notify=force_notify)


 client.run(bot_token)
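For reference, a quick manual way to reproduce the fixed health-check request from the portal host (hypothetical usage, not part of this commit; curl's -k flag skips certificate verification the same way verify=False does):

curl -k https://127.0.0.1/health-check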