Skip to content
Snippets Groups Projects
Select Git revision
  • 56821b2fd105f07a740561cebc8451857a4152a7
  • main default protected
  • test
  • uat.20230605160823
  • uat.latest
5 results

app.py

Blame
  • cli.py 13.50 KiB
    """
    Report Interface Errors
    =======================
    
    CLI tool for generating an interface error report and sending it by email to the OC.
    
    Every day an email may be sent that summarizes interface errors for all (*) GEANT
    routers and other network devices. The error report is sent as an html attachment to
    that email. First all relevant routers and interfaces are requested from Inventory
    Provider. Then InfluxDB is queried for the latest, and yesterday's, error measurement
    points. For every interface, the latest error counts are compared against yesterday's
    error count to determine whether it has suffered new errors. Currently the following
    errors are checked:
    
        * ``bit-error-seconds`` (Juniper)
        * ``errored-blocks-seconds`` (Juniper)
        * ``framing-errors`` (Juniper)
        * ``input-crc-errors`` (Juniper)
        * ``input-total-errors`` (Juniper, Nokia)
        * ``input-discards`` (Juniper, Nokia)
        * ``input-drops`` (Juniper)
        * ``output-discards`` (Nokia)
        * ``output-drops`` (Juniper)
        * ``oper-state-change-count`` (Nokia)
        * ``crc-align-errors`` (Nokia)
        * ``fcs-errors`` (Nokia)
    
    Every interface with new errors is added to a summary report. This report is then
    sent to the OC.
    
    .. code-block:: bash
    
        Usage: report-interface-errors [OPTIONS]
    
        Options:
          --config PATH     Path to a config file for this tool. The schema this config
                            file must adhere to can be found in
                            ``brian_polling_manager.error_report.config.ERROR_REPORT_CONFIG_SCHEMA``
          --email/--no-email    Either send an email using the email config, or print to
                                stdout. Default (--email)
    
    *) There are some rules about which routers/interfaces to include and exclude. See the
    `get_relevant_interfaces`_ function for more details.
    """
    
    from datetime import datetime, timezone
    import logging
    import pathlib
    import sys
    from typing import Sequence, Tuple

    import click
    from influxdb import InfluxDBClient

    from brian_polling_manager.error_report.config import load
    from brian_polling_manager.error_report.report import (
        render_email,
        render_html,
        send_email,
    )
    from brian_polling_manager.influx import influx_client
    from brian_polling_manager.inventory import (
        INVENTORY_INTERFACES_SCHEMA,
        load_inventory_json,
    )
    
    # Module level logger that is used by all functions in this module
    logger = logging.getLogger(__name__)

    # Location of the interfaces endpoint that ``get_relevant_interfaces`` falls back to
    # when the config does not contain a (non-empty) ``inventory-url``
    DEFAULT_INTERFACES_URL = "/poller/error-report-interfaces"

    # The error field names in the influx query vs their reporting name. The keys carry
    # a ``last_`` prefix; the query in ``get_error_points`` selects ``last(*)``, which
    # presumably is where that prefix comes from -- verify against the influx output
    ERROR_FIELDS = {
        "last_bit_error_seconds": "bit-error-seconds",
        "last_errored_blocks_seconds": "errored-blocks-seconds",
        "last_input_framing_errors": "framing-errors",
        "last_input_crc_errors": "input-crc-errors",
        "last_input_discards": "input-discards",
        "last_input_drops": "input-drops",
        "last_input_total_errors": "input-total-errors",
        "last_output_discards": "output-discards",
        "last_output_drops": "output-drops",
        "last_output_total_errors": "output-total-errors",
        "last_oper_state_change_count": "oper-state-change-count",
        "last_crc_align_errors": "crc-align-errors",
        "last_fcs_errors": "fcs-errors",
    }

    # Time conditions that are inserted into the ``WHERE`` clause of the influx query in
    # ``get_error_points``: "today" is the last 24 hours, "yesterday" the 24 hours
    # before that
    INFLUX_TIME_WINDOW_TODAY = "time > now() - 1d"
    INFLUX_TIME_WINDOW_YESTERDAY = "time < now() - 1d and time > now() - 2d"
    
    # JSON schema describing the dictionary that ``interface_errors`` documents as its
    # result: a list of interfaces with new errors and a list of excluded interfaces.
    # NOTE(review): the schema is only declared here, none of the code in this module
    # validates against it -- presumably it is used by tests or by the report rendering
    PROCESSED_ERROR_COUNTERS_SCHEMA = {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "definitions": {
            # A set of error counters, keyed by the reporting names from ERROR_FIELDS.
            # None of the counters is required
            "error_counters_content": {
                "type": "object",
                "properties": {
                    "bit-error-seconds": {"type": "integer"},
                    "errored-blocks-seconds": {"type": "integer"},
                    "framing-errors": {"type": "integer"},
                    "input-crc-errors": {"type": "integer"},
                    "input-discards": {"type": "integer"},
                    "input-drops": {"type": "integer"},
                    "input-total-errors": {"type": "integer"},
                    "output-discards": {"type": "integer"},
                    "output-drops": {"type": "integer"},
                    "output-total-errors": {"type": "integer"},
                    "oper-state-change-count": {"type": "integer"},
                    "crc-align-errors": {"type": "integer"},
                    "fcs-errors": {"type": "integer"},
                },
                "additionalProperties": False,
            },
            # An interface that is part of the report, including its counters
            "interface_error_counters": {
                "type": "object",
                "properties": {
                    "router": {"type": "string"},
                    "interface": {"type": "string"},
                    "description": {"type": "string"},
                    "errors_today": {"$ref": "#/definitions/error_counters_content"},
                    "errors_yesterday": {"$ref": "#/definitions/error_counters_content"},
                    "diff": {"$ref": "#/definitions/error_counters_content"},
                },
                "required": [
                    "router",
                    "interface",
                    "description",
                    "errors_today",
                    "errors_yesterday",
                    "diff",
                ],
                "additionalProperties": False,
            },
            # An excluded interface: identification only, no counters
            "excluded_interface_error_counters": {
                "type": "object",
                "properties": {
                    "router": {"type": "string"},
                    "interface": {"type": "string"},
                    "description": {"type": "string"},
                },
                "required": [
                    "router",
                    "interface",
                    "description",
                ],
                "additionalProperties": False,
            },
        },
        "type": "object",
        "properties": {
            "interfaces": {
                "type": "array",
                "items": {"$ref": "#/definitions/interface_error_counters"},
            },
            "excluded_interfaces": {
                "type": "array",
                "items": {"$ref": "#/definitions/excluded_interface_error_counters"},
            },
        },
        "required": ["interfaces", "excluded_interfaces"],
        "additionalProperties": False,
    }
    
    
    class MessageCountingLogHandler(logging.NullHandler):
        """Log handler that does not emit anything, it only keeps track of how many
        records were handed to it. The running total is available as ``count``
        """

        def __init__(self, level=logging.NOTSET) -> None:
            super().__init__(level)
            # number of records seen so far
            self.count = 0

        def handle(self, _) -> None:
            # The record itself is irrelevant, only the fact that one arrived matters
            self.count = self.count + 1
    
    
    def setup_logging(debug=False) -> MessageCountingLogHandler:
        """
        Set up logging to stderr together with a handler that counts error messages

        :param debug: Boolean. Set log level to DEBUG if True, or INFO otherwise
        :returns: the `MessageCountingLogHandler`_ that was passed to
            ``logging.basicConfig``. It is created with level ERROR
        """
        if debug:
            verbosity = logging.DEBUG
        else:
            verbosity = logging.INFO

        error_counter = MessageCountingLogHandler(level=logging.ERROR)

        stderr_handler = logging.StreamHandler(sys.stderr)
        stderr_handler.setLevel(verbosity)

        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(message)s",
            level=verbosity,
            handlers=[error_counter, stderr_handler],
        )
        return error_counter
    
    
    def get_error_points(client: InfluxDBClient, time_window: str):
        """Get the last value for every error field for every (router, interface)

        :param client: an `InfluxDBClient`_
        :param time_window: an influx time window such as `INFLUX_TIME_WINDOW_TODAY`_ or
            `INFLUX_TIME_WINDOW_YESTERDAY`_
        :returns: a dict {(router, interface): error_point} where error_point is the
            first point of the respective series, or an empty dict if that series does
            not have any points
        """
        # This query may actually return values from multiple different points if
        # some values are missing for the last point. But it's close enough.
        query = (
            f"SELECT last(*) FROM errors WHERE {time_window} "
            "group by hostname, interface_name;"
        )
        result_set = client.query(query)

        error_points = {}
        for (_, tags), points in result_set.items():
            key = (tags["hostname"], tags["interface_name"])
            error_points[key] = next(points, {})
        return error_points
    
    
    def select_error_fields(errors, mapping):
        """Translate an influx error point into a dictionary of reported error counts

        The result has an entry for every target name in `mapping`_. Its value is taken
        from the corresponding source field in `errors`_. Missing fields, ``None``
        values and any other falsy value end up as ``0``

        :param errors: An error point dictionary coming from Influx
        :param mapping: A field name mapping {source: target} (ie. `ERROR_FIELDS`) for
            translating field names from the influx query to their error names in the report
        :returns: A new dictionary containing all relevant error counts
        """
        selected = {}
        for source_name, report_name in mapping.items():
            value = errors.get(source_name)
            # substitute absent and None values by 0
            selected[report_name] = value if value else 0
        return selected
    
    
    def interface_errors(
        error_points_today,
        error_points_yesterday,
        interface_info,
        errors,
        exclusions=(),
    ):
        """
        Determine which interfaces have new errors compared to yesterday

        Every interface in `interface_info`_ is handled as follows:

        * excluded interfaces (see `is_excluded_interface`_) are listed under
          ``excluded_interfaces`` and are not processed any further
        * interfaces without data in `error_points_today`_ are skipped, an error is
          logged for each of them
        * interfaces without any nonzero counter today, or without any counter that is
          higher than yesterday, are skipped
        * all other interfaces are listed under ``interfaces``. ``errors_today`` only
          holds nonzero counters, ``diff`` only holds counters that have increased and
          ``errors_yesterday`` holds all counters

        The reported interfaces are sorted by their total number of new errors (highest
        first), then by router and then by interface name.

        :param error_points_today: todays errors as a return value of ``get_error_points``
        :param error_points_yesterday: yesterdays errors as a return value of
            ``get_error_points``
        :param interface_info: a dict of {(router, interface): info_dict} with interface
            information coming from invprov (ie. the output from `get_relevant_interfaces`_)
        :param errors: A dict of (input_data_field: result_field) for every error to report
            on (see `ERROR_FIELDS`_)
        :param exclusions: A sequence of (router, interface) pairs of interfaces that
            must not be reported on

        :result: an instance of PROCESSED_ERROR_COUNTERS_SCHEMA
        """
        counters_today = {}
        for key, point in error_points_today.items():
            counters_today[key] = select_error_fields(point, mapping=errors)

        reported = []
        excluded = []

        for (router, ifc), info in interface_info.items():
            entry = {
                "router": router,
                "interface": ifc,
                "description": info["description"],
            }

            if is_excluded_interface(info, exclusions):
                logger.info(f"Found excluded interface {router} - {ifc}")
                excluded.append(entry)
                continue

            today = counters_today.get((router, ifc))
            if today is None:
                logger.error(f"{router} - {ifc} not found in influx data")
                continue

            today_nonzero = {}
            for err, val in today.items():
                if val > 0:
                    today_nonzero[err] = val
            if not today_nonzero:
                # nothing to report for an interface without any errors
                continue

            yesterday = select_error_fields(
                error_points_yesterday.get((router, ifc), {}), mapping=errors
            )

            diff = {}
            for err, val in today_nonzero.items():
                increase = val - yesterday[err]
                if increase > 0:
                    diff[err] = increase
            if not diff:
                # none of the error counters went up since yesterday
                continue

            entry["errors_today"] = today_nonzero
            entry["diff"] = diff
            entry["errors_yesterday"] = yesterday
            reported.append(entry)

        def by_new_errors(item):
            # most new errors first, router and interface name break ties
            return (-sum(item["diff"].values()), item["router"], item["interface"])

        reported.sort(key=by_new_errors)
        return {"interfaces": reported, "excluded_interfaces": excluded}
    
    
    def is_excluded_interface(ifc, exclusions: Sequence[Tuple[str, str]]):
        """Some interfaces generate a lot of noise and should be excluded

        :param ifc: an interface info dictionary, its ``router`` and ``name`` are used
        :param exclusions: (router, interface) pairs that must be excluded
        :returns: True if the interface matches any of the exclusions, ignoring case.
            False otherwise
        """
        router = ifc["router"]
        interface = ifc["name"]

        for excl in exclusions:
            if router.lower() != excl[0].lower():
                continue
            if interface.lower() == excl[1].lower():
                return True
        return False
    
    
    def get_relevant_interfaces(config):
        """Get interface info from inventory provider

        The interfaces are requested from ``config["inventory-url"]``, or from
        `DEFAULT_INTERFACES_URL`_ if that setting is absent or empty.

        NOTE(review): no interfaces are filtered out in this function. Which interfaces
        are considered relevant is presumably decided by the endpoint itself -- confirm.

        :param config: the tool's config dictionary, ``inventory`` and optionally
            ``inventory-url`` are used
        :returns: the interfaces as returned by `_sort_interfaces`_
        """
        url = config.get("inventory-url")
        if not url:
            url = DEFAULT_INTERFACES_URL

        interfaces = load_inventory_json(
            url, config["inventory"], INVENTORY_INTERFACES_SCHEMA
        )
        return _sort_interfaces(interfaces)
    
    
    def _sort_interfaces(interfaces):
        return dict(sorted(((i["router"], i["name"]), i) for i in interfaces))
    
    
    def main(config: dict, send_mail: bool):
        """Main function for the error reporting script

        Retrieves the interfaces from inventory provider and the error points of today
        and yesterday from influx, and renders the resulting error counters as an html
        report. The report is then either sent by email or printed to stdout.

        :param config: An instance of `ERROR_REPORT_CONFIG_SCHEMA`
        :param send_mail: Boolean. Send the report by email using ``config["email"]`` if
            True, or print the html report to stdout otherwise
        """
        logger.info(f"Retrieving interfaces from inventory provider: {config['inventory']}")

        all_interfaces = get_relevant_interfaces(config)
        client = influx_client(config["influx"])
        with client:
            logger.info("Retrieving error points from influxdb...")
            all_error_counters = interface_errors(
                error_points_today=get_error_points(client, INFLUX_TIME_WINDOW_TODAY),
                error_points_yesterday=get_error_points(
                    client, INFLUX_TIME_WINDOW_YESTERDAY
                ),
                interface_info=all_interfaces,
                errors=ERROR_FIELDS,
                exclusions=config["exclude-interfaces"],
            )
        logger.info("Generating report...")

        # ``datetime.utcnow()`` is deprecated since python 3.12, use a timezone aware
        # datetime instead. The format does not contain any timezone directive (UTC is
        # spelled out literally), so the resulting string is the same
        report_date = datetime.now(timezone.utc).strftime("%a %d %b %H:%M:%S UTC %Y")
        body = render_html(all_error_counters, date=report_date)
        if not send_mail:
            click.echo(body)
            return

        email = render_email(config["email"], html=body)
        logger.info("Sending email...")
        send_email(email, config=config["email"])
        logger.info("Done!")
    
    
    # Command line entry point: sets up logging, loads the config file and generates
    # the report. The command fails with a ClickException if any message was counted by
    # the handler that `setup_logging`_ returns.
    # NOTE: this is deliberately not written as a docstring on ``cli``, since click
    # would show a docstring as the help text of the command
    @click.command()
    @click.option(
        "-c",
        "--config",
        type=click.Path(
            exists=True,
            file_okay=True,
            dir_okay=False,
            readable=True,
            path_type=pathlib.Path,
        ),
        help="path to a config file",
    )
    @click.option(
        "--email/--no-email",
        "send_mail",
        default=True,
        help=(
            "toggle sending out an email using the email configuration (--email), "
            "or printing the report to stdout (--no-email). Default: --email"
        ),
    )
    def cli(config, send_mail):
        # logging must be set up first, so that messages from loading the config and
        # from generating the report are counted as well
        error_counter = setup_logging()
        main(load(config_file=config), send_mail=send_mail)

        if error_counter.count <= 0:
            return
        raise click.ClickException(
            "Errors were encountered while generating error report"
        )
    
    
    # Run the click command when this module is executed directly as a script
    if __name__ == "__main__":
        cli()