Merge pull request #977 from SkynetLabs/improve-health-check-reporting

Improve health check reporting
This commit is contained in:
Christopher Schinnerl 2021-07-12 15:51:08 +02:00 committed by GitHub
commit 3e516d8a46
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 26 additions and 16 deletions

View File

@ -21,11 +21,11 @@ if len(sys.argv) > 2:
CONTAINER_NAME = sys.argv[2]
# find out local siad ip by inspecting its docker container
def get_docker_container_ip(container_name):
    """Return the IP address of a docker container.

    Runs ``docker inspect`` with a format template that prints the
    ``IPAddress`` of every entry under the container's
    ``NetworkSettings.Networks`` and parses the printed value.

    Args:
        container_name: Name of the container to inspect.

    Returns:
        The container's IP address as a dotted-quad string.

    Raises:
        subprocess.CalledProcessError: If ``docker inspect`` exits with a
            non-zero status.
        OSError: If the ``docker`` executable cannot be started.
        IndexError: If the command output is not a single dotted-quad
            address (the pattern is anchored at both ends, so empty output
            or several addresses printed back to back do not match).
    """
    ip_regex = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
    # Pass the arguments as a list instead of a shell string so that the
    # container name is never interpreted by a shell.
    docker_cmd = [
        "docker",
        "inspect",
        "-f",
        "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}",
        container_name,
    ]
    output = subprocess.check_output(docker_cmd).decode("utf-8")
    matches = ip_regex.findall(output)
    if not matches:
        # Same exception type as indexing an empty list, but with enough
        # context to tell which container failed and what docker printed.
        raise IndexError(
            "could not parse an IP address for container {!r} "
            "from docker output {!r}".format(container_name, output)
        )
    return matches[0]


def get_api_ip():
    """Return the IP address of the container named by ``CONTAINER_NAME``.

    Thin wrapper around ``get_docker_container_ip`` so that call sites
    using the older name keep working.
    """
    return get_docker_container_ip(CONTAINER_NAME)
@ -62,7 +62,7 @@ def setup():
port = os.getenv("API_PORT", "9980")
global api_endpoint
api_endpoint = "http://{}:{}".format(get_api_ip(), port)
api_endpoint = "http://{}:{}".format(get_docker_container_ip(CONTAINER_NAME), port)
siad.initialize()

View File

@ -11,7 +11,7 @@ from datetime import datetime, timedelta
import discord
import requests
from bot_utils import setup, send_msg
from bot_utils import setup, send_msg, get_docker_container_ip
"""
health-checker reads the /health-check endpoint of the portal and dispatches
@ -139,19 +139,29 @@ async def check_health():
print("\nChecking portal health status...")
try:
res_check = requests.get("https://127.0.0.1/health-check", verify=False)
json_check = res_check.json()
json_critical = requests.get(
"https://127.0.0.1/health-check/critical", verify=False
).json()
json_extended = requests.get(
"https://127.0.0.1/health-check/extended", verify=False
).json()
endpoint = "http://{}:{}".format(get_docker_container_ip("health-check"), 3100)
except:
trace = traceback.format_exc()
print("[DEBUG] check_health() failed.")
message = "Could not get health check service endpoint api!"
return await send_msg(client, message, force_notify=True)
try:
res = requests.get(endpoint + "/health-check", verify=False)
json_check = res.json()
server_down = res.status_code is not requests.codes["ok"]
res = requests.get(endpoint + "/health-check/critical", verify=False)
json_critical = res.json()
res = requests.get(endpoint + "/health-check/extended", verify=False)
json_extended = res.json()
except:
message = traceback.format_exc()
message += "\n" + "Request url: " + res.url if res.url else "-"
message += "\n" + "Status code: " + str(res.status_code) if res.status_code else "-"
message += "\n" + "Response body: " + res.text if res.text else "-"
return await send_msg(
client, "Failed to run the checks!", file=trace, force_notify=True
client, "Failed to run health checks!", file=message, force_notify=True
)
critical_checks_total = 0
@ -200,7 +210,7 @@ async def check_health():
if json_check["disabled"]:
message += "__Portal manually disabled!__ "
elif res_check.status_code is not requests.codes["ok"]:
elif server_down:
message += "__Portal down!!!__ "
force_notify = True