Merge pull request #977 from SkynetLabs/improve-health-check-reporting
Improve health check reporting
This commit is contained in:
commit
3e516d8a46
|
@ -21,11 +21,11 @@ if len(sys.argv) > 2:
|
||||||
CONTAINER_NAME = sys.argv[2]
|
CONTAINER_NAME = sys.argv[2]
|
||||||
|
|
||||||
# find out local siad ip by inspecting its docker container
|
# find out local siad ip by inspecting its docker container
|
||||||
def get_api_ip():
|
def get_docker_container_ip(container_name):
|
||||||
ip_regex = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
|
ip_regex = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
|
||||||
docker_cmd = (
|
docker_cmd = (
|
||||||
"docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "
|
"docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "
|
||||||
+ CONTAINER_NAME
|
+ container_name
|
||||||
)
|
)
|
||||||
output = subprocess.check_output(docker_cmd, shell=True).decode("utf-8")
|
output = subprocess.check_output(docker_cmd, shell=True).decode("utf-8")
|
||||||
return ip_regex.findall(output)[0]
|
return ip_regex.findall(output)[0]
|
||||||
|
@ -62,7 +62,7 @@ def setup():
|
||||||
port = os.getenv("API_PORT", "9980")
|
port = os.getenv("API_PORT", "9980")
|
||||||
|
|
||||||
global api_endpoint
|
global api_endpoint
|
||||||
api_endpoint = "http://{}:{}".format(get_api_ip(), port)
|
api_endpoint = "http://{}:{}".format(get_docker_container_ip(CONTAINER_NAME), port)
|
||||||
|
|
||||||
siad.initialize()
|
siad.initialize()
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ from datetime import datetime, timedelta
|
||||||
|
|
||||||
import discord
|
import discord
|
||||||
import requests
|
import requests
|
||||||
from bot_utils import setup, send_msg
|
from bot_utils import setup, send_msg, get_docker_container_ip
|
||||||
|
|
||||||
"""
|
"""
|
||||||
health-checker reads the /health-check endpoint of the portal and dispatches
|
health-checker reads the /health-check endpoint of the portal and dispatches
|
||||||
|
@ -139,19 +139,29 @@ async def check_health():
|
||||||
print("\nChecking portal health status...")
|
print("\nChecking portal health status...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
res_check = requests.get("https://127.0.0.1/health-check", verify=False)
|
endpoint = "http://{}:{}".format(get_docker_container_ip("health-check"), 3100)
|
||||||
json_check = res_check.json()
|
|
||||||
json_critical = requests.get(
|
|
||||||
"https://127.0.0.1/health-check/critical", verify=False
|
|
||||||
).json()
|
|
||||||
json_extended = requests.get(
|
|
||||||
"https://127.0.0.1/health-check/extended", verify=False
|
|
||||||
).json()
|
|
||||||
except:
|
except:
|
||||||
trace = traceback.format_exc()
|
message = "Could not get health check service endpoint api!"
|
||||||
print("[DEBUG] check_health() failed.")
|
return await send_msg(client, message, force_notify=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
res = requests.get(endpoint + "/health-check", verify=False)
|
||||||
|
json_check = res.json()
|
||||||
|
|
||||||
|
server_down = res.status_code is not requests.codes["ok"]
|
||||||
|
|
||||||
|
res = requests.get(endpoint + "/health-check/critical", verify=False)
|
||||||
|
json_critical = res.json()
|
||||||
|
|
||||||
|
res = requests.get(endpoint + "/health-check/extended", verify=False)
|
||||||
|
json_extended = res.json()
|
||||||
|
except:
|
||||||
|
message = traceback.format_exc()
|
||||||
|
message += "\n" + "Request url: " + res.url if res.url else "-"
|
||||||
|
message += "\n" + "Status code: " + str(res.status_code) if res.status_code else "-"
|
||||||
|
message += "\n" + "Response body: " + res.text if res.text else "-"
|
||||||
return await send_msg(
|
return await send_msg(
|
||||||
client, "Failed to run the checks!", file=trace, force_notify=True
|
client, "Failed to run health checks!", file=message, force_notify=True
|
||||||
)
|
)
|
||||||
|
|
||||||
critical_checks_total = 0
|
critical_checks_total = 0
|
||||||
|
@ -200,7 +210,7 @@ async def check_health():
|
||||||
|
|
||||||
if json_check["disabled"]:
|
if json_check["disabled"]:
|
||||||
message += "__Portal manually disabled!__ "
|
message += "__Portal manually disabled!__ "
|
||||||
elif res_check.status_code is not requests.codes["ok"]:
|
elif server_down:
|
||||||
message += "__Portal down!!!__ "
|
message += "__Portal down!!!__ "
|
||||||
force_notify = True
|
force_notify = True
|
||||||
|
|
||||||
|
|
Reference in New Issue