# Skip to content
# Snippets Groups Projects
# cli.py 10.36 KiB
from datetime import datetime
import json
import logging
import os
import pathlib
from typing import Sequence
from brian_polling_manager.interface_stats.services import influx_client
from brian_polling_manager.inventory import load_interfaces
from influxdb import InfluxDBClient
from brian_polling_manager.error_report.config import load
from brian_polling_manager.error_report.mailer import render_html

# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Path of the configuration file that `main` passes to `load`.
# NOTE(review): this is an absolute path into a personal home directory --
# presumably a development placeholder; confirm before deploying.
CONFIG_FILE = "/home/pellek/develop/klanten/geant/error_report-config.json"

# Time conditions that are inserted into the WHERE clause of the influx query in
# `get_error_points`: the last day, and the day before that.
INFLUX_TIME_WINDOW_TODAY = "time > now() - 1d"
INFLUX_TIME_WINDOW_YESTERDAY = "time < now() - 1d and time > now() - 2d"

# Maps the error field names as they appear in the influx query result (keys) to
# the names under which they are reported (values). The reporting names match the
# properties of `error_counters_content` in PROCESSED_ERROR_COUNTERS_SCHEMA.
ERROR_FIELDS = {
    "last_input_framing_errors": "framing-errors",
    "last_bit_error_seconds": "bit-error-seconds",
    "last_errored_blocks_seconds": "errored-blocks-seconds",
    "last_input_crc_errors": "input-crc-errors",
    "last_input_total_errors": "input-total-errors",
    "last_input_discards": "input-discards",
    "last_input_drops": "input-drops",
    "last_output_drops": "output-drops",
}


# JSON schema describing the result of `interface_errors`: an object with a list
# of reported `interfaces` and a list of `excluded_interfaces`. Every entry carries
# the router, interface name, description and its error counters. Only entries in
# `interfaces` may additionally carry a `diff` with the same shape as the error
# counters.
PROCESSED_ERROR_COUNTERS_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "definitions": {
        "error_counters_content": {
            "type": "object",
            "properties": {
                "framing-errors": {"type": "integer"},
                "bit-error-seconds": {"type": "integer"},
                "errored-blocks-seconds": {"type": "integer"},
                "input-crc-errors": {"type": "integer"},
                "input-total-errors": {"type": "integer"},
                "input-discards": {"type": "integer"},
                "input-drops": {"type": "integer"},
                "output-drops": {"type": "integer"},
            },
            "additionalProperties": False,
        },
        "interface_error_counters": {
            "type": "object",
            "properties": {
                "router": {"type": "string"},
                "interface": {"type": "string"},
                "description": {"type": "string"},
                "error_counters": {"$ref": "#/definitions/error_counters_content"},
                "diff": {"$ref": "#/definitions/error_counters_content"},
            },
            "required": [
                "router",
                "interface",
                "description",
                "error_counters",
            ],
            "additionalProperties": False,
        },
        "excluded_interface_error_counters": {
            "type": "object",
            "properties": {
                "router": {"type": "string"},
                "interface": {"type": "string"},
                "description": {"type": "string"},
                "error_counters": {"$ref": "#/definitions/error_counters_content"},
            },
            "required": [
                "router",
                "interface",
                "description",
                "error_counters",
            ],
            "additionalProperties": False,
        },
    },
    "type": "object",
    "properties": {
        "interfaces": {
            "type": "array",
            "items": {"$ref": "#/definitions/interface_error_counters"},
        },
        "excluded_interfaces": {
            "type": "array",
            "items": {"$ref": "#/definitions/excluded_interface_error_counters"},
        },
    },
    "required": ["interfaces", "excluded_interfaces"],
    "additionalProperties": False,
}


# Logging configuration (in `logging.config.dictConfig` format) that
# `setup_logging` uses when LOGGING_CONFIG is not defined in the environment:
# a single console handler writing INFO and above to stderr.
LOGGING_DEFAULT_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {"simple": {"format": "%(asctime)s - %(levelname)s - %(message)s"}},
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "level": "INFO",
            "formatter": "simple",
            "stream": "ext://sys.stderr",
        },
    },
    "loggers": {
        # propagate is off so that records from this package are not also
        # handled by the root logger, which uses the same console handler
        "brian_polling_manager": {
            "level": "INFO",
            "handlers": ["console"],
            "propagate": False,
        }
    },
    "root": {"level": "INFO", "handlers": ["console"]},
}


def setup_logging():
    """
    Set up logging from a `logging.config.dictConfig` style configuration

    If LOGGING_CONFIG is defined in the environment, it is used as the
    filename of a json file containing the configuration, otherwise
    LOGGING_DEFAULT_CONFIG is used.

    :raises OSError: if the file named by LOGGING_CONFIG cannot be opened
    :raises json.JSONDecodeError: if that file does not contain valid json
    """
    # `import logging` alone does not load the `logging.config` submodule, so
    # import it explicitly instead of relying on some other module having done so
    import logging.config

    filename = os.environ.get("LOGGING_CONFIG")
    if filename is None:
        logging_config = LOGGING_DEFAULT_CONFIG
    else:
        with open(filename) as f:
            logging_config = json.load(f)

    logging.config.dictConfig(logging_config)


def get_error_points(client: InfluxDBClient, time_window: str):
    """Fetch the last value of every error field, per (router, interface)

    :param client: an `InfluxDBCLient`_ used to run the query
    :param time_window: an influx time condition such as `INFLUX_TIME_WINDOW_TODAY`_
        or `INFLUX_TIME_WINDOW_YESTERDAY`_; it is inserted verbatim into the WHERE
        clause of the query
    :returns: a dict {(router, interface): error_point} where error_point is the
        first point returned for that interface, or an empty dict if there is none
    """
    # The values selected by `last(*)` may come from multiple different points if
    # some values are missing for the last point. But it's close enough.
    query = (
        f"SELECT last(*) FROM errors WHERE {time_window} "
        "group by hostname, interface_name;"
    )

    latest_points = {}
    for (_, tags), points in client.query(query).items():
        key = (tags["hostname"], tags["interface_name"])
        latest_points[key] = next(points, {})
    return latest_points


def select_error_fields(errors, mapping):
    """Pick the mapped fields from an error point and rename them

    :param errors: a dict with the raw error field values
    :param mapping: a dict of {source_field: target_field}
    :returns: a dict {target_field: value}; fields that are missing from `errors`
        or that hold a falsy value (such as None) are reported as 0
    """
    selected = {}
    for source_field, target_field in mapping.items():
        value = errors.get(source_field)
        # missing and None values are substituted by 0
        selected[target_field] = value if value else 0
    return selected


def interface_errors(
    client: InfluxDBClient, interface_info, errors, exclusions=(), raise_on_errors=False
):
    """
    Build the error counter report for a set of interfaces from influx data

    :param client: InfluxDBClient for connecting to influx
    :param interface_info: a dict of {(router, interface): info_dict} with interface
        information coming from invprov (ie. the output from `get_relevant_interfaces`_)
    :param errors: A dict of (input_data_field: result_field) for every error to report
        on (see `ERROR_FIELDS`_)
    :param exclusions: strings that are matched against the interface description by
        `is_excluded_interface`_; matching interfaces end up in `excluded_interfaces`
    :param raise_on_errors: raise the KeyError when an interface is not found in the
        influx data instead of skipping it (useful for testing)

    :result: an instance of PROCESSED_ERROR_COUNTERS_SCHEMA
    """

    def _latest_counters(time_window):
        # last known error counters per (router, interface), using reporting names
        points = get_error_points(client, time_window)
        return {
            key: select_error_fields(point, mapping=errors)
            for key, point in points.items()
        }

    counters_today = _latest_counters(INFLUX_TIME_WINDOW_TODAY)
    counters_yesterday = _latest_counters(INFLUX_TIME_WINDOW_YESTERDAY)

    report = {"interfaces": [], "excluded_interfaces": []}
    for (router, ifc), info in interface_info.items():
        try:
            today = counters_today[(router, ifc)]
        except KeyError:
            logger.error(f"{router} - {ifc} not found in influx data")
            if raise_on_errors:
                raise
            continue

        # interfaces without any errors are not reported at all
        if all(not (count > 0) for count in today.values()):
            continue

        entry = {
            "router": router,
            "interface": ifc,
            "error_counters": today,
            "description": info["description"],
        }

        if is_excluded_interface(info["description"], exclusions):
            # excluded interfaces keep all of today's counters, including zeros
            logger.info(f"Found excluded interface {router} - {ifc}")
            report["excluded_interfaces"].append(entry)
            continue

        positive = {err: val for err, val in today.items() if val > 0}
        entry["error_counters"] = positive

        yesterday = counters_yesterday.get((router, ifc), {})
        if any(yesterday.values()):
            # There were errors yesterday already, so only report increases.
            #
            # This is strictly not the most correct way to determine differences.
            # During the day the error count may have reset and diffs may actually
            # be negative, but we ignore those because that is (mostly) how it was
            # done in the original bash script
            increases = {}
            for err, val in positive.items():
                delta = val - yesterday[err]
                if delta > 0:
                    increases[err] = delta

            if not increases:
                # no error counter increased, nothing to report for this interface
                continue

            entry["diff"] = increases

        report["interfaces"].append(entry)

    return report


def is_excluded_interface(description: str, exclusions: Sequence[str]):
    """Tell whether an interface should be excluded because it is too noisy

    :param description: the interface description
    :param exclusions: strings to look for in the description
    :returns: True if any exclusion occurs in the description (case-insensitive),
        False otherwise
    """
    # We may want to put this logic inside inventory provider
    for exclusion in exclusions:
        if exclusion.lower() in description.lower():
            return True
    return False


def get_relevant_interfaces(hosts):
    """Get interface info from inventory provider, keeping relevant interfaces only

    The interfaces are loaded with `load_interfaces`_ and then filtered and indexed
    by `_filter_and_convert_interfaces`_. Some interfaces are considered irrelevant
    based on their description or name.

    :param hosts: passed on to `load_interfaces`_
    :returns: the result of `_filter_and_convert_interfaces`_
    """
    all_interfaces = load_interfaces(hosts)
    return _filter_and_convert_interfaces(all_interfaces)


def _filter_and_convert_interfaces(interfaces):
    """Select the relevant interfaces and index them by (router, name)

    An interface is kept when its description contains "PHY" and none of "SPARE",
    "NON-OPERATIONAL", "RESERVED" or "TEST", and its name contains neither "dsc."
    nor "fxp". All checks are case-insensitive.

    :param interfaces: an iterable of interface dicts with the keys "router", "name"
        and "description"
    :returns: a dict {(router, name): interface_dict} ordered by (router, name). If
        several interfaces share the same (router, name), the last one wins.
    """
    # We may want to put this logic inside inventory provider and serve from a new
    # endpoint
    unwanted_in_description = ("SPARE", "NON-OPERATIONAL", "RESERVED", "TEST")
    unwanted_in_name = ("dsc.", "fxp")

    def is_relevant(ifc):
        # normalize once instead of once per check
        description = ifc["description"].upper()
        name = ifc["name"].lower()
        return (
            "PHY" in description
            and not any(word in description for word in unwanted_in_description)
            and not any(part in name for part in unwanted_in_name)
        )

    relevant = [((i["router"], i["name"]), i) for i in interfaces if is_relevant(i)]
    # Sort on the (router, name) key only. Sorting the (key, interface) pairs as a
    # whole would compare the interface dicts whenever two keys are equal, which
    # raises a TypeError. The sort is stable, so duplicates keep their input order.
    relevant.sort(key=lambda item: item[0])
    return dict(relevant)


def main():
    """Render the interface error report

    Sets up logging, loads the configuration from CONFIG_FILE, determines the
    relevant interfaces, collects their error counters from influx and renders
    them to html.
    """
    # Local import: at module level only `datetime` is imported from `datetime`
    from datetime import timezone

    setup_logging()
    config = load(config_file=pathlib.Path(CONFIG_FILE))
    all_interfaces = get_relevant_interfaces(config["inventory"])
    client = influx_client(config["influx"])
    with client:
        all_error_counters = interface_errors(
            client,
            interface_info=all_interfaces,
            errors=ERROR_FIELDS,
            exclusions=config["exclude-interfaces"],
        )
        # NOTE: the rendered body is not used any further in this function yet
        body = render_html(
            all_error_counters,
            # `datetime.utcnow()` is deprecated since Python 3.12; an aware UTC
            # datetime renders to the same string with this format
            date=datetime.now(timezone.utc).strftime("%a %d %b %H:%M:%S UTC %Y"),
        )


    # TODO: ensure data is from the day that we're interested in (today or yesterday)
    # TODO: send script failures to admin email


# Only run the report when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()