# cli.py
from datetime import datetime
import json
import logging
import logging.config
import os
import pathlib
from typing import Sequence

from influxdb import InfluxDBClient

from brian_polling_manager.error_report.config import load
from brian_polling_manager.error_report.mailer import render_html
from brian_polling_manager.interface_stats.services import influx_client
from brian_polling_manager.inventory import load_interfaces

logger = logging.getLogger(__name__)

CONFIG_FILE = "/home/pellek/develop/klanten/geant/error_report-config.json"
INFLUX_TIME_WINDOW_TODAY = "time > now() - 1d"
INFLUX_TIME_WINDOW_YESTERDAY = "time < now() - 1d and time > now() - 2d"

# The error field names as returned by the influx query, mapped to their reporting names
ERROR_FIELDS = {
"last_input_framing_errors": "framing-errors",
"last_bit_error_seconds": "bit-error-seconds",
"last_errored_blocks_seconds": "errored-blocks-seconds",
"last_input_crc_errors": "input-crc-errors",
"last_input_total_errors": "input-total-errors",
"last_input_discards": "input-discards",
"last_input_drops": "input-drops",
"last_output_drops": "output-drops",
}

PROCESSED_ERROR_COUNTERS_SCHEMA = {
"$schema": "https://json-schema.org/draft/2020-12/schema",
"definitions": {
"error_counters_content": {
"type": "object",
"properties": {
"framing-errors": {"type": "integer"},
"bit-error-seconds": {"type": "integer"},
"errored-blocks-seconds": {"type": "integer"},
"input-crc-errors": {"type": "integer"},
"input-total-errors": {"type": "integer"},
"input-discards": {"type": "integer"},
"input-drops": {"type": "integer"},
"output-drops": {"type": "integer"},
},
"additionalProperties": False,
},
"interface_error_counters": {
"type": "object",
"properties": {
"router": {"type": "string"},
"interface": {"type": "string"},
"description": {"type": "string"},
"error_counters": {"$ref": "#/definitions/error_counters_content"},
"diff": {"$ref": "#/definitions/error_counters_content"},
},
"required": [
"router",
"interface",
"description",
"error_counters",
],
"additionalProperties": False,
},
"excluded_interface_error_counters": {
"type": "object",
"properties": {
"router": {"type": "string"},
"interface": {"type": "string"},
"description": {"type": "string"},
"error_counters": {"$ref": "#/definitions/error_counters_content"},
},
"required": [
"router",
"interface",
"description",
"error_counters",
],
"additionalProperties": False,
},
},
"type": "object",
"properties": {
"interfaces": {
"type": "array",
"items": {"$ref": "#/definitions/interface_error_counters"},
},
"excluded_interfaces": {
"type": "array",
"items": {"$ref": "#/definitions/excluded_interface_error_counters"},
},
},
"required": ["interfaces", "excluded_interfaces"],
"additionalProperties": False,
}
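
# A minimal illustrative document matching PROCESSED_ERROR_COUNTERS_SCHEMA
# (router/interface names and counter values are hypothetical):
#   {
#       "interfaces": [
#           {
#               "router": "rt1.example.net",
#               "interface": "ae1",
#               "description": "PHY example link",
#               "error_counters": {"input-drops": 5},
#               "diff": {"input-drops": 2},
#           }
#       ],
#       "excluded_interfaces": [],
#   }
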
LOGGING_DEFAULT_CONFIG = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {"simple": {"format": "%(asctime)s - %(levelname)s - %(message)s"}},
"handlers": {
"console": {
"class": "logging.StreamHandler",
"level": "INFO",
"formatter": "simple",
"stream": "ext://sys.stderr",
},
},
"loggers": {
"brian_polling_manager": {
"level": "INFO",
"handlers": ["console"],
"propagate": False,
}
},
"root": {"level": "INFO", "handlers": ["console"]},
}


def setup_logging():
    """
    Set up logging.

    If LOGGING_CONFIG is defined in the environment, read the logging
    configuration (in dictConfig JSON format) from that file; otherwise use
    LOGGING_DEFAULT_CONFIG.
    """
logging_config = LOGGING_DEFAULT_CONFIG
if "LOGGING_CONFIG" in os.environ:
filename = os.environ["LOGGING_CONFIG"]
with open(filename) as f:
            logging_config = json.load(f)
logging.config.dictConfig(logging_config)


def get_error_points(client: InfluxDBClient, time_window: str):
    """Get the last value of every error field for every (router, interface).

    :param client: an `InfluxDBClient`_
    :param time_window: an influx time window such as `INFLUX_TIME_WINDOW_TODAY`_ or
        `INFLUX_TIME_WINDOW_YESTERDAY`_
    :returns: a dict {(router, interface): error_point} where error_point is a dict
        with all the error field values for that interface
    """
raw_data = client.query(
        # This query may actually return values from multiple different points if
# some values are missing for the last point. But it's close enough.
(
f"SELECT last(*) FROM errors WHERE {time_window} "
"group by hostname, interface_name;"
)
)
return {
(tags["hostname"], tags["interface_name"]): next(points, {})
for (_, tags), points in raw_data.items()
}


def select_error_fields(errors, mapping):
# the extra `or 0` is for substituting None values
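    # e.g. (illustrative): select_error_fields(
    #          {"last_input_drops": None, "last_output_drops": 2},
    #          {"last_input_drops": "input-drops", "last_output_drops": "output-drops"},
    #      ) -> {"input-drops": 0, "output-drops": 2}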
return {tgt: errors.get(src, 0) or 0 for src, tgt in mapping.items()}


def interface_errors(
    client: InfluxDBClient, interface_info, errors, exclusions=(), raise_on_errors=False
):
    """
    Retrieves error counters from influx

    :param client: InfluxDBClient for connecting to influx
    :param interface_info: a dict of {(router, interface): info_dict} with interface
        information coming from inventory provider (i.e. the output of
        `get_relevant_interfaces`_)
    :param errors: a dict of {input_data_field: result_field} for every error to
        report on (see `ERROR_FIELDS`_)
    :param exclusions: a sequence of (case-insensitive) description substrings;
        interfaces whose description matches any of them are reported separately
        as excluded
    :param raise_on_errors: raise when certain exceptions occur (useful for testing)
    :returns: a dict conforming to PROCESSED_ERROR_COUNTERS_SCHEMA
    """
todays_data = {
key: select_error_fields(val, mapping=errors)
for key, val in get_error_points(client, INFLUX_TIME_WINDOW_TODAY).items()
}
yesterdays_data = {
key: select_error_fields(val, mapping=errors)
for key, val in get_error_points(client, INFLUX_TIME_WINDOW_YESTERDAY).items()
}
result = {"interfaces": [], "excluded_interfaces": []}
for (router, ifc), info in interface_info.items():
try:
today = todays_data[(router, ifc)]
except KeyError:
logger.error(f"{router} - {ifc} not found in influx data")
if raise_on_errors:
raise
continue
if not any(err > 0 for err in today.values()):
# skip interfaces without any errors
continue
yesterday = yesterdays_data.get((router, ifc), {})
counters = {
"router": router,
"interface": ifc,
"error_counters": today,
"description": info["description"],
}
if not is_excluded_interface(info["description"], exclusions):
nonzero_errors = {err: val for err, val in today.items() if val > 0}
counters["error_counters"] = nonzero_errors
if any(yesterday.values()):
# we have existing errors
                # This is strictly not the most correct way to determine differences:
                # during the day the error count may have reset and diffs may actually
                # be negative, but we ignore those because that is (mostly) how it was
                # done in the original bash script
diff = {
err: (val - yesterday[err])
for err, val in nonzero_errors.items()
if (val - yesterday[err]) > 0
}
if not diff:
# Skip interface if it does not have any increased error counters
continue
counters["diff"] = diff
result["interfaces"].append(counters)
else:
logger.info(f"Found excluded interface {router} - {ifc}")
result["excluded_interfaces"].append(counters)
return result


def is_excluded_interface(description: str, exclusions: Sequence[str]):
"""Some interfaces generate a lot of noise and should be excluded"""
# We may want to put this logic inside inventory provider
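    # e.g. (illustrative): is_excluded_interface("PHY example noisy link", ["noisy"])
    #      -> True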
return any(excl.lower() in description.lower() for excl in exclusions)


def get_relevant_interfaces(hosts):
    """Get interface info from inventory provider. Some interfaces are considered
    irrelevant based on their description"""
return _filter_and_convert_interfaces(load_interfaces(hosts))


def _filter_and_convert_interfaces(interfaces):
# We may want to put this logic inside inventory provider and serve from a new
# endpoint
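    # Keep only physical interfaces ("PHY" in the description) that are not marked
    # SPARE/NON-OPERATIONAL/RESERVED/TEST and whose name does not contain "dsc." or
    # "fxp", keyed and sorted by (router, interface name). Illustrative example
    # (hypothetical data): an interface with description "PHY example link" is kept,
    # one with description "PHY SPARE" is filtered out.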
return dict(
sorted(
((i["router"], i["name"]), i)
for i in interfaces
if all(
(
"PHY" in i["description"].upper(),
"SPARE" not in i["description"].upper(),
"NON-OPERATIONAL" not in i["description"].upper(),
"RESERVED" not in i["description"].upper(),
"TEST" not in i["description"].upper(),
"dsc." not in i["name"].lower(),
"fxp" not in i["name"].lower(),
)
)
)
)


def main():
setup_logging()
config = load(config_file=pathlib.Path(CONFIG_FILE))
all_interfaces = get_relevant_interfaces(config["inventory"])
client = influx_client(config["influx"])
with client:
all_error_counters = interface_errors(
client,
interface_info=all_interfaces,
errors=ERROR_FIELDS,
exclusions=config["exclude-interfaces"],
)
body = render_html(
all_error_counters,
date=datetime.utcnow().strftime("%a %d %b %H:%M:%S UTC %Y"),
)
# TODO: ensure data is from the day that we're interested in (today or yesterday)
# TODO: send script failures to admin email


if __name__ == "__main__":
main()