""" Report Interface Errors ======================= CLI tool for generating an interface error report and sending it by email to the OC. Every day an email may be sent that summarizes interface errors for all (*) GEANT routers and other network devices. The error report is sent as an html attachment to that email. First all relevant routers and interfaces are requested from Inventory Provider. Then InfluxDB is queried for the latest, and yesterday's, error measurement points. For every interface, the latest error counts are compared against yesterday's error count to determine whether it has suffered new errors. Currently the following errors are checked: * ``framing-errors`` * ``bit-error-seconds`` * ``errored-blocks-seconds`` * ``input-crc-errors`` * ``input-total-errors`` * ``input-discards`` * ``input-drops`` * ``output-drops`` For every interface with new errors is added to a summary report. This report is then sent to the OC. .. code-block:: bash Usage: report-interface-errors [OPTIONS] Options: --config PATH Path to a config file for this tool. The schema this config file must adhere to can be found in ``brian_polling_manager.error_report.config.ERROR_REPORT_CONFIG_SCHEMA`` [2024-04-09] This tool is the successor of a bash-script that was used before. That script has some peculiarities in it's output and as of this new version mimics the output of the earlier tool as much as possible. *) There are some rules which routers/interfaces to include and exclude. See the `get_relevant_interfaces`_ function for more details. """ from datetime import datetime import json import logging import os import pathlib from typing import Sequence from brian_polling_manager.influx import influx_client from brian_polling_manager.inventory import load_interfaces import click from influxdb import InfluxDBClient from brian_polling_manager.error_report.config import load from brian_polling_manager.error_report.report import ( render_email, render_html, send_email, ) logger = logging.getLogger(__name__) # The error field names in the influx query vs their reporting name ERROR_FIELDS = { "last_input_framing_errors": "framing-errors", "last_bit_error_seconds": "bit-error-seconds", "last_errored_blocks_seconds": "errored-blocks-seconds", "last_input_crc_errors": "input-crc-errors", "last_input_total_errors": "input-total-errors", "last_input_discards": "input-discards", "last_input_drops": "input-drops", "last_output_drops": "output-drops", } INFLUX_TIME_WINDOW_TODAY = "time > now() - 1d" INFLUX_TIME_WINDOW_YESTERDAY = "time < now() - 1d and time > now() - 2d" PROCESSED_ERROR_COUNTERS_SCHEMA = { "$schema": "https://json-schema.org/draft/2020-12/schema", "definitions": { "error_counters_content": { "type": "object", "properties": { "framing-errors": {"type": "integer"}, "bit-error-seconds": {"type": "integer"}, "errored-blocks-seconds": {"type": "integer"}, "input-crc-errors": {"type": "integer"}, "input-total-errors": {"type": "integer"}, "input-discards": {"type": "integer"}, "input-drops": {"type": "integer"}, "output-drops": {"type": "integer"}, }, "additionalProperties": False, }, "interface_error_counters": { "type": "object", "properties": { "router": {"type": "string"}, "interface": {"type": "string"}, "description": {"type": "string"}, "error_counters": {"$ref": "#/definitions/error_counters_content"}, "diff": {"$ref": "#/definitions/error_counters_content"}, }, "required": [ "router", "interface", "description", "error_counters", ], "additionalProperties": False, }, "excluded_interface_error_counters": { "type": "object", "properties": { "router": {"type": "string"}, "interface": {"type": "string"}, "description": {"type": "string"}, "error_counters": {"$ref": "#/definitions/error_counters_content"}, }, "required": [ "router", "interface", "description", "error_counters", ], "additionalProperties": False, }, }, "type": "object", "properties": { "interfaces": { "type": "array", "items": {"$ref": "#/definitions/interface_error_counters"}, }, "excluded_interfaces": { "type": "array", "items": {"$ref": "#/definitions/excluded_interface_error_counters"}, }, }, "required": ["interfaces", "excluded_interfaces"], "additionalProperties": False, } LOGGING_DEFAULT_CONFIG = { "version": 1, "disable_existing_loggers": False, "formatters": {"simple": {"format": "%(asctime)s - %(levelname)s - %(message)s"}}, "handlers": { "console": { "class": "logging.StreamHandler", "level": "INFO", "formatter": "simple", "stream": "ext://sys.stdout", }, }, "loggers": { "brian_polling_manager": { "level": "INFO", "handlers": ["console"], "propagate": False, } }, "root": {"level": "INFO", "handlers": ["console"]}, } def setup_logging(): """ set up logging using the configured filename if LOGGING_CONFIG is defined in the environment, use this for the filename, otherwise use LOGGING_DEFAULT_CONFIG """ logging_config = LOGGING_DEFAULT_CONFIG if "LOGGING_CONFIG" in os.environ: filename = os.environ["LOGGING_CONFIG"] with open(filename) as f: logging_config = json.loads(f.read()) logging.config.dictConfig(logging_config) def get_error_points(client: InfluxDBClient, time_window: str): """Get the last value for every error field for every (router, interface) :param client: an `InfluxDBCLient`_ :param time_window: an influx time window such as `INFLUX_TIME_WINDOW_TODAY` or `INFLUX_TIME_WINDOW_YESTERDAY_ :returns: a dict {(router, interface): error_point } were error_point is a dict with all the error field values for that respective interface """ raw_data = client.query( # This query may actually return values from mulitple different points if # some values are missing for the last point. But it's close enough. ( f"SELECT last(*) FROM errors WHERE {time_window} " "group by hostname, interface_name;" ) ) return { (tags["hostname"], tags["interface_name"]): next(points, {}) for (_, tags), points in raw_data.items() } def select_error_fields(errors, mapping): """Create a dictionary with every target key from `mapping`_ and its corresponding value from the `errors`_ dictionary, or ``0`` if it doesn't exist or has a ``None`` value :param errors: An error point dictionary coming from Influx :param mapping: A field name mapping {source: target} (ie. `ERROR_FIELDS`) for translating field names from the influx query to their error names in the report :returns: A new dictionary containing all relevant error counts """ # the extra `or 0` is for substituting None values return {tgt: errors.get(src, 0) or 0 for src, tgt in mapping.items()} def interface_errors( error_points_today, error_points_yesterday, interface_info, errors, exclusions=(), raise_on_errors=False, ): """ Retrieves error counters from influx :param error_points_today: todays errors as a return value of ``get_error_points`` :param error_points_yesterday: yesterdays errors as a return value of ``get_error_points`` :param interface_info: a dict of {(router, interface): info_dict} with interface information coming from invprov (ie. the output from `get_relevant_interfaces`_) :param errors: A dict of (input_data_field: result_field) for every error to report on (see `ERROR_FIELDS`_) :param raise_on_errors: raise when certain exceptions occur (useful for testing) :result: an instance of PROCESSED_ERROR_COUNTERS_SCHEMA """ todays_data = { key: select_error_fields(val, mapping=errors) for key, val in error_points_today.items() } yesterdays_data = { key: select_error_fields(val, mapping=errors) for key, val in error_points_yesterday.items() } result = {"interfaces": [], "excluded_interfaces": []} for (router, ifc), info in interface_info.items(): try: today = todays_data[(router, ifc)] except KeyError: logger.error(f"{router} - {ifc} not found in influx data") if raise_on_errors: raise continue if not any(err > 0 for err in today.values()): # skip interfaces without any errors continue yesterday = yesterdays_data.get((router, ifc), {}) counters = { "router": router, "interface": ifc, "error_counters": today, "description": info["description"], } if not is_excluded_interface(info["description"], exclusions): nonzero_errors = {err: val for err, val in today.items() if val > 0} counters["error_counters"] = nonzero_errors if any(yesterday.values()): # we have existing errors # This is strictly not the most correct way to determine differences. # during the day the error count may have reset and diffs may actually # be negative, but we ignore those because that is (mostly) how it was # done in the orginal bash script diff = { err: (val - yesterday[err]) for err, val in nonzero_errors.items() if (val - yesterday[err]) > 0 } if not diff: # Skip interface if it does not have any increased error counters continue counters["diff"] = diff result["interfaces"].append(counters) else: logger.info(f"Found excluded interface {router} - {ifc}") result["excluded_interfaces"].append(counters) return result def is_excluded_interface(description: str, exclusions: Sequence[str]): """Some interfaces generate a lot of noise and should be excluded""" # We may want to put this logic inside inventory provider return any(excl.lower() in description.lower() for excl in exclusions) def get_relevant_interfaces(hosts): """Get interface info from inventory provider. Some interfaces are considered irrelevant based on their description""" return _filter_and_sort_interfaces(load_interfaces(hosts)) def _filter_and_sort_interfaces(interfaces): # We may want to put this logic inside inventory provider and serve from a new # endpoint return dict( sorted( ((i["router"], i["name"]), i) for i in interfaces if all( ( "PHY" in i["description"].upper(), "SPARE" not in i["description"].upper(), "NON-OPERATIONAL" not in i["description"].upper(), "RESERVED" not in i["description"].upper(), "TEST" not in i["description"].upper(), "dsc." not in i["name"].lower(), "fxp" not in i["name"].lower(), ) ) ) ) def main(config: dict): """Main function for the error reporting script :param config: An instance of `ERROR_REPORT_CONFIG_SCHEMA` """ logger.info(f"Retrieving interfaces from inventory provider: {config['inventory']}") all_interfaces = get_relevant_interfaces(config["inventory"]) client = influx_client(config["influx"]) with client: logger.info("Retrieving error points from influxdb...") all_error_counters = interface_errors( error_points_today=get_error_points(client, INFLUX_TIME_WINDOW_TODAY), error_points_yesterday=get_error_points( client, INFLUX_TIME_WINDOW_YESTERDAY ), interface_info=all_interfaces, errors=ERROR_FIELDS, exclusions=config["exclude-interfaces"], ) logger.info("Generating report...") body = render_html( all_error_counters, date=datetime.utcnow().strftime("%a %d %b %H:%M:%S UTC %Y"), ) email = render_email(config["email"], html=body) logger.info("Sending email...") send_email(email, config=config["email"]) logger.info("Done!") @click.command() @click.option( "-c", "--config", type=click.Path( exists=True, file_okay=True, dir_okay=False, readable=True, path_type=pathlib.Path, ), help="path to a config file", ) def cli(config): setup_logging() config = load(config_file=config) main(config) if __name__ == "__main__": cli()