Merge pull request #977 from SkynetLabs/improve-health-check-reporting
Improve health check reporting
This commit is contained in:
commit
3e516d8a46
|
@@ -21,11 +21,11 @@ if len(sys.argv) > 2:
|
|||
CONTAINER_NAME = sys.argv[2]
|
||||
|
||||
# find out local siad ip by inspecting its docker container
|
||||
def get_api_ip():
|
||||
def get_docker_container_ip(container_name):
|
||||
ip_regex = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
|
||||
docker_cmd = (
|
||||
"docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "
|
||||
+ CONTAINER_NAME
|
||||
+ container_name
|
||||
)
|
||||
output = subprocess.check_output(docker_cmd, shell=True).decode("utf-8")
|
||||
return ip_regex.findall(output)[0]
|
||||
|
@@ -62,7 +62,7 @@ def setup():
|
|||
port = os.getenv("API_PORT", "9980")
|
||||
|
||||
global api_endpoint
|
||||
api_endpoint = "http://{}:{}".format(get_api_ip(), port)
|
||||
api_endpoint = "http://{}:{}".format(get_docker_container_ip(CONTAINER_NAME), port)
|
||||
|
||||
siad.initialize()
|
||||
|
||||
|
|
|
@@ -11,7 +11,7 @@ from datetime import datetime, timedelta
|
|||
|
||||
import discord
|
||||
import requests
|
||||
from bot_utils import setup, send_msg
|
||||
from bot_utils import setup, send_msg, get_docker_container_ip
|
||||
|
||||
"""
|
||||
health-checker reads the /health-check endpoint of the portal and dispatches
|
||||
|
@@ -139,19 +139,29 @@ async def check_health():
|
|||
print("\nChecking portal health status...")
|
||||
|
||||
try:
|
||||
res_check = requests.get("https://127.0.0.1/health-check", verify=False)
|
||||
json_check = res_check.json()
|
||||
json_critical = requests.get(
|
||||
"https://127.0.0.1/health-check/critical", verify=False
|
||||
).json()
|
||||
json_extended = requests.get(
|
||||
"https://127.0.0.1/health-check/extended", verify=False
|
||||
).json()
|
||||
endpoint = "http://{}:{}".format(get_docker_container_ip("health-check"), 3100)
|
||||
except:
|
||||
trace = traceback.format_exc()
|
||||
print("[DEBUG] check_health() failed.")
|
||||
message = "Could not get health check service endpoint api!"
|
||||
return await send_msg(client, message, force_notify=True)
|
||||
|
||||
try:
|
||||
res = requests.get(endpoint + "/health-check", verify=False)
|
||||
json_check = res.json()
|
||||
|
||||
server_down = res.status_code is not requests.codes["ok"]
|
||||
|
||||
res = requests.get(endpoint + "/health-check/critical", verify=False)
|
||||
json_critical = res.json()
|
||||
|
||||
res = requests.get(endpoint + "/health-check/extended", verify=False)
|
||||
json_extended = res.json()
|
||||
except:
|
||||
message = traceback.format_exc()
|
||||
message += "\n" + "Request url: " + res.url if res.url else "-"
|
||||
message += "\n" + "Status code: " + str(res.status_code) if res.status_code else "-"
|
||||
message += "\n" + "Response body: " + res.text if res.text else "-"
|
||||
return await send_msg(
|
||||
client, "Failed to run the checks!", file=trace, force_notify=True
|
||||
client, "Failed to run health checks!", file=message, force_notify=True
|
||||
)
|
||||
|
||||
critical_checks_total = 0
|
||||
|
@@ -200,7 +210,7 @@ async def check_health():
|
|||
|
||||
if json_check["disabled"]:
|
||||
message += "__Portal manually disabled!__ "
|
||||
elif res_check.status_code is not requests.codes["ok"]:
|
||||
elif server_down:
|
||||
message += "__Portal down!!!__ "
|
||||
force_notify = True
|
||||
|
||||
|
|
Reference in New Issue