Merge pull request #1260 from SkynetLabs/fix-health-check-script

fix health check script invalid syntax
This commit is contained in:
Ivaylo Novakov 2021-10-06 14:36:46 +02:00 committed by GitHub
commit 1aa3928236
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 223 additions and 71 deletions

37
.github/workflows/python-lint.yml vendored Normal file
View File

@ -0,0 +1,37 @@
name: Python Lint
on:
push:
paths:
- "**.py"
jobs:
black:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.x"
architecture: x64
- run: pip install black
- run: black --check .
flake8:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.x"
architecture: x64
- run: pip install flake8
# E203: https://www.flake8rules.com/rules/E203.html - Whitespace before ':'
# E501: https://www.flake8rules.com/rules/E501.html - Line too long
# W503: https://www.flake8rules.com/rules/W503.html - Line break occurred before a binary operator
# W605: https://www.flake8rules.com/rules/W605.html - Invalid escape sequence
# E722: https://www.flake8rules.com/rules/E722.html - Do not use bare except, specify exception instead
- run: flake8 --max-line-length 88 --ignore E203,E501,W503,W605,E722

View File

@ -8,43 +8,70 @@ import sys
TIMEOUT = 120 TIMEOUT = 120
def main(): def main():
if len(sys.argv) != 3: if len(sys.argv) != 3:
print('USAGE: [INDEX_PREFIX=(default "")] [ARCHIVE=(default false)] ... {} NUM_OF_DAYS http://HOSTNAME[:PORT]'.format(sys.argv[0])) print(
print('NUM_OF_DAYS ... delete indices that are older than the given number of days.') 'USAGE: [INDEX_PREFIX=(default "")] [ARCHIVE=(default false)] ... {} NUM_OF_DAYS http://HOSTNAME[:PORT]'.format(
print('HOSTNAME ... specifies which Elasticsearch hosts URL to search and delete indices from.') sys.argv[0]
print('TIMEOUT ... number of seconds to wait for master node response.'.format(TIMEOUT)) )
print('INDEX_PREFIX ... specifies index prefix.') )
print('INDEX_DATE_SEPARATOR ... specifies index date separator.') print(
print('ARCHIVE ... specifies whether to remove archive indices (only works for rollover) (default false).') "NUM_OF_DAYS ... delete indices that are older than the given number of days."
print('ROLLOVER ... specifies whether to remove indices created by rollover (default false).') )
print('ES_USERNAME ... The username required by Elasticsearch.') print(
print('ES_PASSWORD ... The password required by Elasticsearch.') "HOSTNAME ... specifies which Elasticsearch hosts URL to search and delete indices from."
print('ES_TLS ... enable TLS (default false).') )
print('ES_TLS_CA ... Path to TLS CA file.') print(
print('ES_TLS_CERT ... Path to TLS certificate file.') "TIMEOUT ... number of seconds to wait for master node response, default: {}".format(
print('ES_TLS_KEY ... Path to TLS key file.') TIMEOUT
print('ES_TLS_SKIP_HOST_VERIFY ... (insecure) Skip server\'s certificate chain and host name verification.') )
)
print("INDEX_PREFIX ... specifies index prefix.")
print("INDEX_DATE_SEPARATOR ... specifies index date separator.")
print(
"ARCHIVE ... specifies whether to remove archive indices (only works for rollover) (default false)."
)
print(
"ROLLOVER ... specifies whether to remove indices created by rollover (default false)."
)
print("ES_USERNAME ... The username required by Elasticsearch.")
print("ES_PASSWORD ... The password required by Elasticsearch.")
print("ES_TLS ... enable TLS (default false).")
print("ES_TLS_CA ... Path to TLS CA file.")
print("ES_TLS_CERT ... Path to TLS certificate file.")
print("ES_TLS_KEY ... Path to TLS key file.")
print(
"ES_TLS_SKIP_HOST_VERIFY ... (insecure) Skip server's certificate chain and host name verification."
)
sys.exit(1) sys.exit(1)
client = create_client(os.getenv("ES_USERNAME"), os.getenv("ES_PASSWORD"), str2bool(os.getenv("ES_TLS", 'false')), os.getenv("ES_TLS_CA"), os.getenv("ES_TLS_CERT"), os.getenv("ES_TLS_KEY"), str2bool(os.getenv("ES_TLS_SKIP_HOST_VERIFY", 'false'))) client = create_client(
os.getenv("ES_USERNAME"),
os.getenv("ES_PASSWORD"),
str2bool(os.getenv("ES_TLS", "false")),
os.getenv("ES_TLS_CA"),
os.getenv("ES_TLS_CERT"),
os.getenv("ES_TLS_KEY"),
str2bool(os.getenv("ES_TLS_SKIP_HOST_VERIFY", "false")),
)
ilo = curator.IndexList(client) ilo = curator.IndexList(client)
empty_list(ilo, 'Elasticsearch has no indices') empty_list(ilo, "Elasticsearch has no indices")
prefix = os.getenv("INDEX_PREFIX", '') prefix = os.getenv("INDEX_PREFIX", "")
if prefix != '': if prefix != "":
prefix += '-' prefix += "-"
separator = os.getenv("INDEX_DATE_SEPARATOR", '-') separator = os.getenv("INDEX_DATE_SEPARATOR", "-")
if str2bool(os.getenv("ARCHIVE", 'false')): if str2bool(os.getenv("ARCHIVE", "false")):
filter_archive_indices_rollover(ilo, prefix) filter_archive_indices_rollover(ilo, prefix)
else: else:
if str2bool(os.getenv("ROLLOVER", 'false')): if str2bool(os.getenv("ROLLOVER", "false")):
filter_main_indices_rollover(ilo, prefix) filter_main_indices_rollover(ilo, prefix)
else: else:
filter_main_indices(ilo, prefix, separator) filter_main_indices(ilo, prefix, separator)
empty_list(ilo, 'No indices to delete') empty_list(ilo, "No indices to delete")
for index in ilo.working_list(): for index in ilo.working_list():
print("Removing", index) print("Removing", index)
@ -57,32 +84,50 @@ def filter_main_indices(ilo, prefix, separator):
date_regex = "\d{4}" + separator + "\d{2}" + separator + "\d{2}" date_regex = "\d{4}" + separator + "\d{2}" + separator + "\d{2}"
time_string = "%Y" + separator + "%m" + separator + "%d" time_string = "%Y" + separator + "%m" + separator + "%d"
ilo.filter_by_regex(kind='regex', value=prefix + "jaeger-(span|service|dependencies)-" + date_regex) ilo.filter_by_regex(
kind="regex", value=prefix + "jaeger-(span|service|dependencies)-" + date_regex
)
empty_list(ilo, "No indices to delete") empty_list(ilo, "No indices to delete")
# This excludes archive index as we use source='name' # This excludes archive index as we use source='name'
# source `creation_date` would include archive index # source `creation_date` would include archive index
ilo.filter_by_age(source='name', direction='older', timestring=time_string, unit='days', unit_count=int(sys.argv[1])) ilo.filter_by_age(
source="name",
direction="older",
timestring=time_string,
unit="days",
unit_count=int(sys.argv[1]),
)
def filter_main_indices_rollover(ilo, prefix): def filter_main_indices_rollover(ilo, prefix):
ilo.filter_by_regex(kind='regex', value=prefix + "jaeger-(span|service)-\d{6}") ilo.filter_by_regex(kind="regex", value=prefix + "jaeger-(span|service)-\d{6}")
empty_list(ilo, "No indices to delete") empty_list(ilo, "No indices to delete")
# do not remove active write indices # do not remove active write indices
ilo.filter_by_alias(aliases=[prefix + 'jaeger-span-write'], exclude=True) ilo.filter_by_alias(aliases=[prefix + "jaeger-span-write"], exclude=True)
empty_list(ilo, "No indices to delete") empty_list(ilo, "No indices to delete")
ilo.filter_by_alias(aliases=[prefix + 'jaeger-service-write'], exclude=True) ilo.filter_by_alias(aliases=[prefix + "jaeger-service-write"], exclude=True)
empty_list(ilo, "No indices to delete") empty_list(ilo, "No indices to delete")
ilo.filter_by_age(source='creation_date', direction='older', unit='days', unit_count=int(sys.argv[1])) ilo.filter_by_age(
source="creation_date",
direction="older",
unit="days",
unit_count=int(sys.argv[1]),
)
def filter_archive_indices_rollover(ilo, prefix): def filter_archive_indices_rollover(ilo, prefix):
# Remove only rollover archive indices # Remove only rollover archive indices
# Do not remove active write archive index # Do not remove active write archive index
ilo.filter_by_regex(kind='regex', value=prefix + "jaeger-span-archive-\d{6}") ilo.filter_by_regex(kind="regex", value=prefix + "jaeger-span-archive-\d{6}")
empty_list(ilo, "No indices to delete") empty_list(ilo, "No indices to delete")
ilo.filter_by_alias(aliases=[prefix + 'jaeger-span-archive-write'], exclude=True) ilo.filter_by_alias(aliases=[prefix + "jaeger-span-archive-write"], exclude=True)
empty_list(ilo, "No indices to delete") empty_list(ilo, "No indices to delete")
ilo.filter_by_age(source='creation_date', direction='older', unit='days', unit_count=int(sys.argv[1])) ilo.filter_by_age(
source="creation_date",
direction="older",
unit="days",
unit_count=int(sys.argv[1]),
)
def empty_list(ilo, error_msg): def empty_list(ilo, error_msg):
@ -94,7 +139,7 @@ def empty_list(ilo, error_msg):
def str2bool(v): def str2bool(v):
return v.lower() in ('true', '1') return v.lower() in ("true", "1")
def create_client(username, password, tls, ca, cert, key, skipHostVerify): def create_client(username, password, tls, ca, cert, key, skipHostVerify):
@ -105,7 +150,9 @@ def create_client(username, password, tls, ca, cert, key, skipHostVerify):
context.check_hostname = False context.check_hostname = False
context.verify_mode = ssl.CERT_NONE context.verify_mode = ssl.CERT_NONE
if username is not None and password is not None: if username is not None and password is not None:
return elasticsearch.Elasticsearch(sys.argv[2:], http_auth=(username, password), ssl_context=context) return elasticsearch.Elasticsearch(
sys.argv[2:], http_auth=(username, password), ssl_context=context
)
elif tls: elif tls:
context.load_cert_chain(certfile=cert, keyfile=key) context.load_cert_chain(certfile=cert, keyfile=key)
return elasticsearch.Elasticsearch(sys.argv[2:], ssl_context=context) return elasticsearch.Elasticsearch(sys.argv[2:], ssl_context=context)

View File

@ -1,10 +1,16 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import traceback, os, re, asyncio, requests, json
from bot_utils import setup, send_msg from bot_utils import setup, send_msg
from random import randint from random import randint
from time import sleep from time import sleep
import traceback
import os
import re
import asyncio
import requests
import json
setup() setup()
AIRTABLE_API_KEY = os.getenv("AIRTABLE_API_KEY") AIRTABLE_API_KEY = os.getenv("AIRTABLE_API_KEY")
@ -12,6 +18,7 @@ AIRTABLE_BASE = os.getenv("AIRTABLE_BASE", "app89plJvA9EqTJEc")
AIRTABLE_TABLE = os.getenv("AIRTABLE_TABLE", "Table%201") AIRTABLE_TABLE = os.getenv("AIRTABLE_TABLE", "Table%201")
AIRTABLE_FIELD = os.getenv("AIRTABLE_FIELD", "Link") AIRTABLE_FIELD = os.getenv("AIRTABLE_FIELD", "Link")
async def run_checks(): async def run_checks():
try: try:
await block_skylinks_from_airtable() await block_skylinks_from_airtable()
@ -31,10 +38,22 @@ async def block_skylinks_from_airtable():
offset = None offset = None
retry = 0 retry = 0
while len(skylinks) == 0 or offset: while len(skylinks) == 0 or offset:
print("Requesting a batch of records from Airtable with " + (offset if offset else "empty") + " offset" + (" (retry " + str(retry) + ")" if retry else "")) print(
query = "&".join(["fields%5B%5D=" + AIRTABLE_FIELD, ("offset=" + offset) if offset else ""]) "Requesting a batch of records from Airtable with "
+ (offset if offset else "empty")
+ " offset"
+ (" (retry " + str(retry) + ")" if retry else "")
)
query = "&".join(
["fields%5B%5D=" + AIRTABLE_FIELD, ("offset=" + offset) if offset else ""]
)
response = requests.get( response = requests.get(
"https://api.airtable.com/v0/" + AIRTABLE_BASE + "/" + AIRTABLE_TABLE + "?" + query, "https://api.airtable.com/v0/"
+ AIRTABLE_BASE
+ "/"
+ AIRTABLE_TABLE
+ "?"
+ query,
headers=headers, headers=headers,
) )
@ -47,37 +66,59 @@ async def block_skylinks_from_airtable():
sleep(randint(1, 10)) sleep(randint(1, 10))
continue continue
else: else:
return await send_msg("Airtable: too many retries, aborting!", force_notify=True) return await send_msg(
"Airtable: too many retries, aborting!", force_notify=True
)
retry = 0 # reset retry counter retry = 0 # reset retry counter
if response.status_code != 200: if response.status_code != 200:
status_code = str(response.status_code) status_code = str(response.status_code)
response_text = response.text or "empty response" response_text = response.text or "empty response"
message = "Airtable blocklist integration responded with code " + status_code + ": " + response_text message = (
"Airtable blocklist integration responded with code "
+ status_code
+ ": "
+ response_text
)
return await send_msg(message, force_notify=False) return await send_msg(message, force_notify=False)
data = response.json() data = response.json()
if len(data["records"]) == 0: if len(data["records"]) == 0:
return print("Airtable returned 0 records - make sure your configuration is correct") return print(
"Airtable returned 0 records - make sure your configuration is correct"
)
skylinks = skylinks + [entry["fields"].get(AIRTABLE_FIELD, "") for entry in data["records"]] skylinks = skylinks + [
skylinks = [skylink for skylink in skylinks if skylink] # filter empty skylinks, most likely empty rows entry["fields"].get(AIRTABLE_FIELD, "") for entry in data["records"]
]
skylinks = [
skylink for skylink in skylinks if skylink
] # filter empty skylinks, most likely empty rows
offset = data.get("offset") offset = data.get("offset")
print("Airtable returned total " + str(len(skylinks)) + " skylinks to block") print("Airtable returned total " + str(len(skylinks)) + " skylinks to block")
skylinks_returned = skylinks skylinks_returned = skylinks
skylinks = [skylink for skylink in skylinks if re.search("^[a-zA-Z0-9_-]{46}$", skylink)] skylinks = [
skylink for skylink in skylinks if re.search("^[a-zA-Z0-9_-]{46}$", skylink)
]
if len(skylinks_returned) != len(skylinks): if len(skylinks_returned) != len(skylinks):
invalid_skylinks = [str(skylink) for skylink in list(set(skylinks_returned) - set(skylinks))] invalid_skylinks = [
message = str(len(invalid_skylinks)) + " of the skylinks returned from Airtable are not valid" str(skylink) for skylink in list(set(skylinks_returned) - set(skylinks))
]
message = (
str(len(invalid_skylinks))
+ " of the skylinks returned from Airtable are not valid"
)
await send_msg(message, file=("\n".join(invalid_skylinks))) await send_msg(message, file=("\n".join(invalid_skylinks)))
apipassword = exec("docker exec sia cat /sia-data/apipassword") apipassword = exec("docker exec sia cat /sia-data/apipassword")
ipaddress = exec("docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' sia") ipaddress = exec(
"docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' sia"
)
print("Sending blocklist request to siad") print("Sending blocklist request to siad")
response = requests.post( response = requests.post(
@ -92,7 +133,12 @@ async def block_skylinks_from_airtable():
else: else:
status_code = str(response.status_code) status_code = str(response.status_code)
response_text = response.text or "empty response" response_text = response.text or "empty response"
message = "Siad blocklist endpoint responded with code " + status_code + ": " + response_text message = (
"Siad blocklist endpoint responded with code "
+ status_code
+ ": "
+ response_text
)
return await send_msg(message, force_notify=False) return await send_msg(message, force_notify=False)
print("Searching nginx cache for blocked files") print("Searching nginx cache for blocked files")
@ -104,12 +150,20 @@ async def block_skylinks_from_airtable():
+ "|".join(skylinks[i : i + batch_size]) + "|".join(skylinks[i : i + batch_size])
+ ")'" + ")'"
) )
cached_files_count+= int(exec('docker exec nginx bash -c "' + cached_files_command + ' | xargs -r rm -v | wc -l"')) cached_files_count += int(
exec(
'docker exec nginx bash -c "'
+ cached_files_command
+ ' | xargs -r rm -v | wc -l"'
)
)
if cached_files_count == 0: if cached_files_count == 0:
return print("No nginx cached files matching blocked skylinks were found") return print("No nginx cached files matching blocked skylinks were found")
message = "Purged " + str(cached_files_count) + " blocklisted files from nginx cache" message = (
"Purged " + str(cached_files_count) + " blocklisted files from nginx cache"
)
return await send_msg(message) return await send_msg(message)

View File

@ -1,12 +1,19 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from urllib.request import urlopen, Request
from dotenv import load_dotenv from dotenv import load_dotenv
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
from discord_webhook import DiscordWebhook from discord_webhook import DiscordWebhook
import urllib, json, os, traceback, sys, re, subprocess, requests, io import urllib
import json
import os
import traceback
import sys
import re
import subprocess
import requests
import io
# Load dotenv file if possible. # Load dotenv file if possible.
# TODO: change all scripts to use named flags/params # TODO: change all scripts to use named flags/params
@ -25,6 +32,7 @@ sc_precision = 10 ** 24
# Environment variable globals # Environment variable globals
setup_done = False setup_done = False
# find out local siad ip by inspecting its docker container # find out local siad ip by inspecting its docker container
def get_docker_container_ip(container_name): def get_docker_container_ip(container_name):
ip_regex = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$") ip_regex = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
@ -41,6 +49,7 @@ api_endpoint = "http://{}:{}".format(
get_docker_container_ip(CONTAINER_NAME), os.getenv("API_PORT", "9980") get_docker_container_ip(CONTAINER_NAME), os.getenv("API_PORT", "9980")
) )
# find siad api password by getting it out of the docker container # find siad api password by getting it out of the docker container
def get_api_password(): def get_api_password():
api_password_regex = re.compile(r"^\w+$") api_password_regex = re.compile(r"^\w+$")
@ -56,7 +65,8 @@ def setup():
setup_done = True setup_done = True
# send_msg sends the msg to the specified discord channel. If force_notify is set to true it adds "@here". # send_msg sends the msg to the specified discord channel.
# If force_notify is set to true it adds "@here".
async def send_msg(msg, force_notify=False, file=None): async def send_msg(msg, force_notify=False, file=None):
try: try:
webhook_url = os.getenv("DISCORD_WEBHOOK_URL") webhook_url = os.getenv("DISCORD_WEBHOOK_URL")

View File

@ -5,9 +5,11 @@ funds-checker runs simple checks on a portal node using the siad API and
dispatches messages to a Discord channel. dispatches messages to a Discord channel.
""" """
import traceback, asyncio, os
from bot_utils import setup, send_msg, siad, sc_precision from bot_utils import setup, send_msg, siad, sc_precision
import traceback
import asyncio
setup() setup()

View File

@ -105,7 +105,7 @@ async def check_disk():
) )
inspect = os.popen("docker inspect sia").read().strip() inspect = os.popen("docker inspect sia").read().strip()
inspect_json = json.loads(inspect) inspect_json = json.loads(inspect)
if inspect_json[0]["State"]["Running"] == True: if inspect_json[0]["State"]["Running"] is True:
# mark portal as unhealthy # mark portal as unhealthy
os.popen("docker exec health-check cli/disable") os.popen("docker exec health-check cli/disable")
time.sleep(300) # wait 5 minutes to propagate dns changes time.sleep(300) # wait 5 minutes to propagate dns changes
@ -133,7 +133,10 @@ async def check_health():
res = requests.get(endpoint + "/health-check", verify=False) res = requests.get(endpoint + "/health-check", verify=False)
json_check = res.json() json_check = res.json()
server_failure = res.status_code is not requests.codes["ok"] and json_check["disabled"] == False: server_failure = (
res.status_code is not requests.codes["ok"]
and json_check["disabled"] is False
)
res = requests.get(endpoint + "/health-check/critical", verify=False) res = requests.get(endpoint + "/health-check/critical", verify=False)
json_critical = res.json() json_critical = res.json()
@ -171,12 +174,12 @@ async def check_health():
bad = False bad = False
for check in critical["checks"]: for check in critical["checks"]:
critical_checks_total += 1 critical_checks_total += 1
if check["up"] == False: if check["up"] is False:
critical_checks_failed += 1 critical_checks_failed += 1
bad = True bad = True
if bad: if bad:
critical["checks"] = [ critical["checks"] = [
check for check in critical["checks"] if check["up"] == False check for check in critical["checks"] if check["up"] is False
] ]
failed_records.append(critical) failed_records.append(critical)
@ -187,12 +190,12 @@ async def check_health():
bad = False bad = False
for check in extended["checks"]: for check in extended["checks"]:
extended_checks_total += 1 extended_checks_total += 1
if check["up"] == False: if check["up"] is False:
extended_checks_failed += 1 extended_checks_failed += 1
bad = True bad = True
if bad: if bad:
extended["checks"] = [ extended["checks"] = [
check for check in extended["checks"] if check["up"] == False check for check in extended["checks"] if check["up"] is False
] ]
failed_records.append(extended) failed_records.append(extended)
@ -227,11 +230,7 @@ async def check_health():
failed_records_file = json.dumps(failed_records, indent=2) failed_records_file = json.dumps(failed_records, indent=2)
# send a message if we force notification, there is a failures dump or just once daily (heartbeat) on 1 AM # send a message if we force notification, there is a failures dump or just once daily (heartbeat) on 1 AM
if ( if force_notify or failed_records_file or datetime.utcnow().hour == 1:
force_notify
or failed_records_file
or datetime.utcnow().hour == 1
):
return await send_msg( return await send_msg(
message, file=failed_records_file, force_notify=force_notify message, file=failed_records_file, force_notify=force_notify
) )

View File

@ -1,9 +1,12 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import sys, traceback, io, os, asyncio from bot_utils import setup, send_msg
from bot_utils import setup, send_msg, upload_to_skynet
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
import sys
import traceback
import asyncio
""" """
log-checker checks the docker logs for siad. log-checker checks the docker logs for siad.