Merge pull request #1260 from SkynetLabs/fix-health-check-script

fix health check script invalid syntax
This commit is contained in:
Ivaylo Novakov 2021-10-06 14:36:46 +02:00 committed by GitHub
commit 1aa3928236
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 223 additions and 71 deletions

37
.github/workflows/python-lint.yml vendored Normal file
View File

@ -0,0 +1,37 @@
name: Python Lint
on:
push:
paths:
- "**.py"
jobs:
black:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.x"
architecture: x64
- run: pip install black
- run: black --check .
flake8:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.x"
architecture: x64
- run: pip install flake8
# E203: https://www.flake8rules.com/rules/E203.html - Whitespace before ':'
# E501: https://www.flake8rules.com/rules/E501.html - Line too long
# W503: https://www.flake8rules.com/rules/W503.html - Line break occurred before a binary operator
# W605: https://www.flake8rules.com/rules/W605.html - Invalid escape sequence
# E722: https://www.flake8rules.com/rules/E722.html - Do not use bare except, specify exception instead
- run: flake8 --max-line-length 88 --ignore E203,E501,W503,W605,E722

View File

@ -8,43 +8,70 @@ import sys
TIMEOUT = 120
def main():
if len(sys.argv) != 3:
print('USAGE: [INDEX_PREFIX=(default "")] [ARCHIVE=(default false)] ... {} NUM_OF_DAYS http://HOSTNAME[:PORT]'.format(sys.argv[0]))
print('NUM_OF_DAYS ... delete indices that are older than the given number of days.')
print('HOSTNAME ... specifies which Elasticsearch hosts URL to search and delete indices from.')
print('TIMEOUT ... number of seconds to wait for master node response.'.format(TIMEOUT))
print('INDEX_PREFIX ... specifies index prefix.')
print('INDEX_DATE_SEPARATOR ... specifies index date separator.')
print('ARCHIVE ... specifies whether to remove archive indices (only works for rollover) (default false).')
print('ROLLOVER ... specifies whether to remove indices created by rollover (default false).')
print('ES_USERNAME ... The username required by Elasticsearch.')
print('ES_PASSWORD ... The password required by Elasticsearch.')
print('ES_TLS ... enable TLS (default false).')
print('ES_TLS_CA ... Path to TLS CA file.')
print('ES_TLS_CERT ... Path to TLS certificate file.')
print('ES_TLS_KEY ... Path to TLS key file.')
print('ES_TLS_SKIP_HOST_VERIFY ... (insecure) Skip server\'s certificate chain and host name verification.')
print(
'USAGE: [INDEX_PREFIX=(default "")] [ARCHIVE=(default false)] ... {} NUM_OF_DAYS http://HOSTNAME[:PORT]'.format(
sys.argv[0]
)
)
print(
"NUM_OF_DAYS ... delete indices that are older than the given number of days."
)
print(
"HOSTNAME ... specifies which Elasticsearch hosts URL to search and delete indices from."
)
print(
"TIMEOUT ... number of seconds to wait for master node response, default: {}".format(
TIMEOUT
)
)
print("INDEX_PREFIX ... specifies index prefix.")
print("INDEX_DATE_SEPARATOR ... specifies index date separator.")
print(
"ARCHIVE ... specifies whether to remove archive indices (only works for rollover) (default false)."
)
print(
"ROLLOVER ... specifies whether to remove indices created by rollover (default false)."
)
print("ES_USERNAME ... The username required by Elasticsearch.")
print("ES_PASSWORD ... The password required by Elasticsearch.")
print("ES_TLS ... enable TLS (default false).")
print("ES_TLS_CA ... Path to TLS CA file.")
print("ES_TLS_CERT ... Path to TLS certificate file.")
print("ES_TLS_KEY ... Path to TLS key file.")
print(
"ES_TLS_SKIP_HOST_VERIFY ... (insecure) Skip server's certificate chain and host name verification."
)
sys.exit(1)
client = create_client(os.getenv("ES_USERNAME"), os.getenv("ES_PASSWORD"), str2bool(os.getenv("ES_TLS", 'false')), os.getenv("ES_TLS_CA"), os.getenv("ES_TLS_CERT"), os.getenv("ES_TLS_KEY"), str2bool(os.getenv("ES_TLS_SKIP_HOST_VERIFY", 'false')))
client = create_client(
os.getenv("ES_USERNAME"),
os.getenv("ES_PASSWORD"),
str2bool(os.getenv("ES_TLS", "false")),
os.getenv("ES_TLS_CA"),
os.getenv("ES_TLS_CERT"),
os.getenv("ES_TLS_KEY"),
str2bool(os.getenv("ES_TLS_SKIP_HOST_VERIFY", "false")),
)
ilo = curator.IndexList(client)
empty_list(ilo, 'Elasticsearch has no indices')
empty_list(ilo, "Elasticsearch has no indices")
prefix = os.getenv("INDEX_PREFIX", '')
if prefix != '':
prefix += '-'
separator = os.getenv("INDEX_DATE_SEPARATOR", '-')
prefix = os.getenv("INDEX_PREFIX", "")
if prefix != "":
prefix += "-"
separator = os.getenv("INDEX_DATE_SEPARATOR", "-")
if str2bool(os.getenv("ARCHIVE", 'false')):
if str2bool(os.getenv("ARCHIVE", "false")):
filter_archive_indices_rollover(ilo, prefix)
else:
if str2bool(os.getenv("ROLLOVER", 'false')):
if str2bool(os.getenv("ROLLOVER", "false")):
filter_main_indices_rollover(ilo, prefix)
else:
filter_main_indices(ilo, prefix, separator)
empty_list(ilo, 'No indices to delete')
empty_list(ilo, "No indices to delete")
for index in ilo.working_list():
print("Removing", index)
@ -57,32 +84,50 @@ def filter_main_indices(ilo, prefix, separator):
date_regex = "\d{4}" + separator + "\d{2}" + separator + "\d{2}"
time_string = "%Y" + separator + "%m" + separator + "%d"
ilo.filter_by_regex(kind='regex', value=prefix + "jaeger-(span|service|dependencies)-" + date_regex)
ilo.filter_by_regex(
kind="regex", value=prefix + "jaeger-(span|service|dependencies)-" + date_regex
)
empty_list(ilo, "No indices to delete")
# This excludes archive index as we use source='name'
# source `creation_date` would include archive index
ilo.filter_by_age(source='name', direction='older', timestring=time_string, unit='days', unit_count=int(sys.argv[1]))
ilo.filter_by_age(
source="name",
direction="older",
timestring=time_string,
unit="days",
unit_count=int(sys.argv[1]),
)
def filter_main_indices_rollover(ilo, prefix):
ilo.filter_by_regex(kind='regex', value=prefix + "jaeger-(span|service)-\d{6}")
ilo.filter_by_regex(kind="regex", value=prefix + "jaeger-(span|service)-\d{6}")
empty_list(ilo, "No indices to delete")
# do not remove active write indices
ilo.filter_by_alias(aliases=[prefix + 'jaeger-span-write'], exclude=True)
ilo.filter_by_alias(aliases=[prefix + "jaeger-span-write"], exclude=True)
empty_list(ilo, "No indices to delete")
ilo.filter_by_alias(aliases=[prefix + 'jaeger-service-write'], exclude=True)
ilo.filter_by_alias(aliases=[prefix + "jaeger-service-write"], exclude=True)
empty_list(ilo, "No indices to delete")
ilo.filter_by_age(source='creation_date', direction='older', unit='days', unit_count=int(sys.argv[1]))
ilo.filter_by_age(
source="creation_date",
direction="older",
unit="days",
unit_count=int(sys.argv[1]),
)
def filter_archive_indices_rollover(ilo, prefix):
# Remove only rollover archive indices
# Do not remove active write archive index
ilo.filter_by_regex(kind='regex', value=prefix + "jaeger-span-archive-\d{6}")
ilo.filter_by_regex(kind="regex", value=prefix + "jaeger-span-archive-\d{6}")
empty_list(ilo, "No indices to delete")
ilo.filter_by_alias(aliases=[prefix + 'jaeger-span-archive-write'], exclude=True)
ilo.filter_by_alias(aliases=[prefix + "jaeger-span-archive-write"], exclude=True)
empty_list(ilo, "No indices to delete")
ilo.filter_by_age(source='creation_date', direction='older', unit='days', unit_count=int(sys.argv[1]))
ilo.filter_by_age(
source="creation_date",
direction="older",
unit="days",
unit_count=int(sys.argv[1]),
)
def empty_list(ilo, error_msg):
@ -94,7 +139,7 @@ def empty_list(ilo, error_msg):
def str2bool(v):
return v.lower() in ('true', '1')
return v.lower() in ("true", "1")
def create_client(username, password, tls, ca, cert, key, skipHostVerify):
@ -105,7 +150,9 @@ def create_client(username, password, tls, ca, cert, key, skipHostVerify):
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
if username is not None and password is not None:
return elasticsearch.Elasticsearch(sys.argv[2:], http_auth=(username, password), ssl_context=context)
return elasticsearch.Elasticsearch(
sys.argv[2:], http_auth=(username, password), ssl_context=context
)
elif tls:
context.load_cert_chain(certfile=cert, keyfile=key)
return elasticsearch.Elasticsearch(sys.argv[2:], ssl_context=context)

View File

@ -1,10 +1,16 @@
#!/usr/bin/env python3
import traceback, os, re, asyncio, requests, json
from bot_utils import setup, send_msg
from random import randint
from time import sleep
import traceback
import os
import re
import asyncio
import requests
import json
setup()
AIRTABLE_API_KEY = os.getenv("AIRTABLE_API_KEY")
@ -12,6 +18,7 @@ AIRTABLE_BASE = os.getenv("AIRTABLE_BASE", "app89plJvA9EqTJEc")
AIRTABLE_TABLE = os.getenv("AIRTABLE_TABLE", "Table%201")
AIRTABLE_FIELD = os.getenv("AIRTABLE_FIELD", "Link")
async def run_checks():
try:
await block_skylinks_from_airtable()
@ -31,10 +38,22 @@ async def block_skylinks_from_airtable():
offset = None
retry = 0
while len(skylinks) == 0 or offset:
print("Requesting a batch of records from Airtable with " + (offset if offset else "empty") + " offset" + (" (retry " + str(retry) + ")" if retry else ""))
query = "&".join(["fields%5B%5D=" + AIRTABLE_FIELD, ("offset=" + offset) if offset else ""])
print(
"Requesting a batch of records from Airtable with "
+ (offset if offset else "empty")
+ " offset"
+ (" (retry " + str(retry) + ")" if retry else "")
)
query = "&".join(
["fields%5B%5D=" + AIRTABLE_FIELD, ("offset=" + offset) if offset else ""]
)
response = requests.get(
"https://api.airtable.com/v0/" + AIRTABLE_BASE + "/" + AIRTABLE_TABLE + "?" + query,
"https://api.airtable.com/v0/"
+ AIRTABLE_BASE
+ "/"
+ AIRTABLE_TABLE
+ "?"
+ query,
headers=headers,
)
@ -47,37 +66,59 @@ async def block_skylinks_from_airtable():
sleep(randint(1, 10))
continue
else:
return await send_msg("Airtable: too many retries, aborting!", force_notify=True)
return await send_msg(
"Airtable: too many retries, aborting!", force_notify=True
)
retry = 0 # reset retry counter
if response.status_code != 200:
status_code = str(response.status_code)
response_text = response.text or "empty response"
message = "Airtable blocklist integration responded with code " + status_code + ": " + response_text
message = (
"Airtable blocklist integration responded with code "
+ status_code
+ ": "
+ response_text
)
return await send_msg(message, force_notify=False)
data = response.json()
if len(data["records"]) == 0:
return print("Airtable returned 0 records - make sure your configuration is correct")
return print(
"Airtable returned 0 records - make sure your configuration is correct"
)
skylinks = skylinks + [entry["fields"].get(AIRTABLE_FIELD, "") for entry in data["records"]]
skylinks = [skylink for skylink in skylinks if skylink] # filter empty skylinks, most likely empty rows
skylinks = skylinks + [
entry["fields"].get(AIRTABLE_FIELD, "") for entry in data["records"]
]
skylinks = [
skylink for skylink in skylinks if skylink
] # filter empty skylinks, most likely empty rows
offset = data.get("offset")
print("Airtable returned total " + str(len(skylinks)) + " skylinks to block")
skylinks_returned = skylinks
skylinks = [skylink for skylink in skylinks if re.search("^[a-zA-Z0-9_-]{46}$", skylink)]
skylinks = [
skylink for skylink in skylinks if re.search("^[a-zA-Z0-9_-]{46}$", skylink)
]
if len(skylinks_returned) != len(skylinks):
invalid_skylinks = [str(skylink) for skylink in list(set(skylinks_returned) - set(skylinks))]
message = str(len(invalid_skylinks)) + " of the skylinks returned from Airtable are not valid"
invalid_skylinks = [
str(skylink) for skylink in list(set(skylinks_returned) - set(skylinks))
]
message = (
str(len(invalid_skylinks))
+ " of the skylinks returned from Airtable are not valid"
)
await send_msg(message, file=("\n".join(invalid_skylinks)))
apipassword = exec("docker exec sia cat /sia-data/apipassword")
ipaddress = exec("docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' sia")
ipaddress = exec(
"docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' sia"
)
print("Sending blocklist request to siad")
response = requests.post(
@ -92,7 +133,12 @@ async def block_skylinks_from_airtable():
else:
status_code = str(response.status_code)
response_text = response.text or "empty response"
message = "Siad blocklist endpoint responded with code " + status_code + ": " + response_text
message = (
"Siad blocklist endpoint responded with code "
+ status_code
+ ": "
+ response_text
)
return await send_msg(message, force_notify=False)
print("Searching nginx cache for blocked files")
@ -104,12 +150,20 @@ async def block_skylinks_from_airtable():
+ "|".join(skylinks[i : i + batch_size])
+ ")'"
)
cached_files_count+= int(exec('docker exec nginx bash -c "' + cached_files_command + ' | xargs -r rm -v | wc -l"'))
cached_files_count += int(
exec(
'docker exec nginx bash -c "'
+ cached_files_command
+ ' | xargs -r rm -v | wc -l"'
)
)
if cached_files_count == 0:
return print("No nginx cached files matching blocked skylinks were found")
message = "Purged " + str(cached_files_count) + " blocklisted files from nginx cache"
message = (
"Purged " + str(cached_files_count) + " blocklisted files from nginx cache"
)
return await send_msg(message)

View File

@ -1,12 +1,19 @@
#!/usr/bin/env python3
from urllib.request import urlopen, Request
from dotenv import load_dotenv
from pathlib import Path
from datetime import datetime
from discord_webhook import DiscordWebhook
import urllib, json, os, traceback, sys, re, subprocess, requests, io
import urllib
import json
import os
import traceback
import sys
import re
import subprocess
import requests
import io
# Load dotenv file if possible.
# TODO: change all scripts to use named flags/params
@ -25,6 +32,7 @@ sc_precision = 10 ** 24
# Environment variable globals
setup_done = False
# find out local siad ip by inspecting its docker container
def get_docker_container_ip(container_name):
ip_regex = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
@ -41,6 +49,7 @@ api_endpoint = "http://{}:{}".format(
get_docker_container_ip(CONTAINER_NAME), os.getenv("API_PORT", "9980")
)
# find siad api password by getting it out of the docker container
def get_api_password():
api_password_regex = re.compile(r"^\w+$")
@ -56,7 +65,8 @@ def setup():
setup_done = True
# send_msg sends the msg to the specified discord channel. If force_notify is set to true it adds "@here".
# send_msg sends the msg to the specified discord channel.
# If force_notify is set to true it adds "@here".
async def send_msg(msg, force_notify=False, file=None):
try:
webhook_url = os.getenv("DISCORD_WEBHOOK_URL")

View File

@ -5,9 +5,11 @@ funds-checker runs simple checks on a portal node using the siad API and
dispatches messages to a Discord channel.
"""
import traceback, asyncio, os
from bot_utils import setup, send_msg, siad, sc_precision
import traceback
import asyncio
setup()

View File

@ -105,7 +105,7 @@ async def check_disk():
)
inspect = os.popen("docker inspect sia").read().strip()
inspect_json = json.loads(inspect)
if inspect_json[0]["State"]["Running"] == True:
if inspect_json[0]["State"]["Running"] is True:
# mark portal as unhealthy
os.popen("docker exec health-check cli/disable")
time.sleep(300) # wait 5 minutes to propagate dns changes
@ -133,7 +133,10 @@ async def check_health():
res = requests.get(endpoint + "/health-check", verify=False)
json_check = res.json()
server_failure = res.status_code is not requests.codes["ok"] and json_check["disabled"] == False:
server_failure = (
res.status_code is not requests.codes["ok"]
and json_check["disabled"] is False
)
res = requests.get(endpoint + "/health-check/critical", verify=False)
json_critical = res.json()
@ -171,12 +174,12 @@ async def check_health():
bad = False
for check in critical["checks"]:
critical_checks_total += 1
if check["up"] == False:
if check["up"] is False:
critical_checks_failed += 1
bad = True
if bad:
critical["checks"] = [
check for check in critical["checks"] if check["up"] == False
check for check in critical["checks"] if check["up"] is False
]
failed_records.append(critical)
@ -187,12 +190,12 @@ async def check_health():
bad = False
for check in extended["checks"]:
extended_checks_total += 1
if check["up"] == False:
if check["up"] is False:
extended_checks_failed += 1
bad = True
if bad:
extended["checks"] = [
check for check in extended["checks"] if check["up"] == False
check for check in extended["checks"] if check["up"] is False
]
failed_records.append(extended)
@ -227,11 +230,7 @@ async def check_health():
failed_records_file = json.dumps(failed_records, indent=2)
# send a message if we force notification, there is a failures dump or just once daily (heartbeat) on 1 AM
if (
force_notify
or failed_records_file
or datetime.utcnow().hour == 1
):
if force_notify or failed_records_file or datetime.utcnow().hour == 1:
return await send_msg(
message, file=failed_records_file, force_notify=force_notify
)

View File

@ -1,9 +1,12 @@
#!/usr/bin/env python3
import sys, traceback, io, os, asyncio
from bot_utils import setup, send_msg, upload_to_skynet
from bot_utils import setup, send_msg
from subprocess import Popen, PIPE
import sys
import traceback
import asyncio
"""
log-checker checks the docker logs for siad.