logger = logging.getLogger(__name__)

# NOTE(review): the addresses and the config path below are hard-coded
# development values. Within this module only CONFIG_FILE is read (by `main`).
SEND_FROM = "ne@geant.org"
REPLY_TO = "noreply@geant.org"
SEND_TO = "pelle.koster@geant.org"
CONFIG_FILE = "/home/pellek/develop/klanten/geant/error_report-config.json"

# InfluxQL time predicates; they are inserted verbatim into the WHERE clause of
# the query built by `get_error_points`
TIME_WINDOW_TODAY = "time > now() - 1d"
TIME_WINDOW_YESTERDAY = "time > now() - 2d and time < now() - 1d"

# The desired error fields vs their field name in the influx query
ERROR_FIELDS = [
    ("framing-errors", "last_input_framing_errors"),
    ("bit-error-seconds", "last_bit_error_seconds"),
    ("errored-blocks-seconds", "last_errored_blocks_seconds"),
    ("input-crc-errors", "last_input_crc_errors"),
    ("input-total-errors", "last_input_total_errors"),
    ("input-discards", "last_input_discards"),
    ("input-drops", "last_input_drops"),
    ("output-drops", "last_output_drops"),
]

# Describes the first element of the tuple returned by `interface_errors`
PROCESSED_ERROR_COUNTERS_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "definitions": {
        "error_counters_content": {
            "type": "object",
            # every key is an error name, every value an integer counter
            "additionalProperties": {"type": "integer"},
        },
        "interface_error_counters": {
            "type": "object",
            "properties": {
                "router": {"type": "string"},
                "interface": {"type": "string"},
                "description": {"type": "string"},
                "error_counters": {"$ref": "#/definitions/error_counters_content"},
                "diff": {"$ref": "#/definitions/error_counters_content"},
                "has_new_errors": {"type": "boolean"},
            },
            "required": [
                "router",
                "interface",
                "description",
                "error_counters",
                "diff",
                "has_new_errors",
            ],
        },
        "excluded_interface_error_counters": {
            "type": "object",
            "properties": {
                "router": {"type": "string"},
                "interface": {"type": "string"},
                "description": {"type": "string"},
                "error_counters": {"$ref": "#/definitions/error_counters_content"},
            },
            "required": [
                "router",
                "interface",
                "description",
                "error_counters",
            ],
        },
    },
    "type": "object",
    "properties": {
        # key names match the dict that `interface_errors` builds
        "interfaces": {
            "type": "array",
            "items": {"$ref": "#/definitions/interface_error_counters"},
        },
        "excluded_interfaces": {
            "type": "array",
            "items": {"$ref": "#/definitions/excluded_interface_error_counters"},
        },
    },
    "required": ["interfaces", "excluded_interfaces"],
    "additionalProperties": False,
}


# Illustrative sample of the structure described by
# PROCESSED_ERROR_COUNTERS_SCHEMA (the Ellipsis values are placeholders)
email_config = {
    "interfaces": [
        {
            "router": "rt1.some.router.geant.net",
            "interface": "ef-0/1/1",
            "description": "SOME DESCRIPTION",
            "error_counters": {"output-drops": 1},
            "diff": {"output-drops": 1},
            "has_new_errors": True,
        },
    ],
    "excluded_interfaces": [
        {
            "router": ...,
            "interface": ...,
            "description": ...,
            "error_counters": ...,
        },
    ],
}


def get_error_points(client: "InfluxDBClient", time_window: str):
    """
    Fetch the last known error counter values for every interface from influx

    :param client: InfluxDBClient for connecting to influx
    :param time_window: an InfluxQL time predicate that is inserted verbatim into
        the WHERE clause (see `TIME_WINDOW_TODAY`_ / `TIME_WINDOW_YESTERDAY`_)

    :return: a dict of {(hostname, interface_name): point}. Series that do not
        yield any point are left out
    """
    raw_data = client.query(
        # This query may actually return values from multiple different points if
        # some values are missing for the last point. But it's close enough.
        (
            "SELECT last(*) FROM errors "
            f"WHERE {time_window} "
            "group by hostname, interface_name order by time desc;"
        )
    )
    points_by_interface = {}
    for (_, tags), points in raw_data.items():
        # Only the first point of a series is used. An empty series is skipped
        # rather than letting StopIteration escape from here.
        point = next(iter(points), None)
        if point is not None:
            points_by_interface[(tags["hostname"], tags["interface_name"])] = point
    return points_by_interface


def interface_errors(
    client: "InfluxDBClient",
    interface_info,
    errors,
    raise_on_errors=False,
    exclusions: Sequence[str] = (),
):
    """
    Retrieves error counters from influx

    :param client: InfluxDBClient for connecting to influx
    :param interface_info: a dict of {(router, interface): info_dict} with interface
        information coming from invprov (ie. the output from `get_relevant_interfaces`_)
    :param errors: A list of (result_field, input_data_field) for every error to report
        on (see `ERROR_FIELDS`_)
    :param raise_on_errors: raise when certain exceptions occur (useful for testing)
    :param exclusions: description parts; an interface whose description contains
        one of them (see `is_excluded_interface`_) is reported under
        ``excluded_interfaces`` without ``diff`` / ``has_new_errors``

    :return: a tuple ``(result, exception_count)``. ``result`` is an instance of
        PROCESSED_ERROR_COUNTERS_SCHEMA, ``exception_count`` is the number of
        interfaces that were skipped because they are missing from
        ``interface_info``
    :raises KeyError: if ``raise_on_errors`` is set and an interface from influx is
        missing from ``interface_info``
    """
    todays_data = get_error_points(client, TIME_WINDOW_TODAY)
    yesterdays_data = get_error_points(client, TIME_WINDOW_YESTERDAY)

    result = {"interfaces": [], "excluded_interfaces": []}
    exception_count = 0
    for (router, ifc), today in todays_data.items():
        try:
            info = interface_info[(router, ifc)]
        except KeyError:
            logger.exception("%s - %s not found in inventory provider", router, ifc)
            exception_count += 1
            if raise_on_errors:
                raise
            # without inventory info there is nothing to report for this interface
            continue

        counters = {
            # a counter that is missing or None is reported as 0
            "error_counters": {
                name: (today.get(field) or 0) for name, field in errors
            },
            "router": router,
            "interface": ifc,
            "description": info["description"],
        }

        if is_excluded_interface(info["description"], exclusions):
            result["excluded_interfaces"].append(counters)
            continue

        yesterday = yesterdays_data.get((router, ifc), {})
        counters["diff"] = {
            name: counters["error_counters"][name] - (yesterday.get(field) or 0)
            for name, field in errors
        }
        # This is strictly not the most correct way to determine whether we have
        # new errors (During the day the error count may have reset, some diffs may
        # actually be negative), but it is (mostly) how it was determined previously
        counters["has_new_errors"] = any(v > 0 for v in counters["diff"].values())
        result["interfaces"].append(counters)

    return result, exception_count


def is_excluded_interface(description: str, exclusions: Sequence[str]):
    """Some interfaces generate a lot of noise and should be excluded

    :param description: the interface description
    :param exclusions: description parts that mark an interface as excluded
    :return: True if any exclusion occurs (case insensitively) in the description
    """
    # We may want to put this logic inside inventory provider
    lowered = description.lower()
    return any(excl.lower() in lowered for excl in exclusions)


def get_relevant_interfaces(hosts, load_interfaces_=None):
    """Get interface info from inventory provider. Some interfaces are not considered
    based on their description

    :param hosts: passed on unchanged to the loader
    :param load_interfaces_: callable that takes ``hosts`` and returns an iterable
        of interface dicts with at least the keys ``router``, ``name`` and
        ``description``. Defaults to `load_interfaces`, resolved at call time so
        that patching the module level name takes effect
    :return: a dict of {(router, name): interface_dict} for every interface whose
        description contains PHY and none of SPARE, NON-OPERATIONAL, RESERVED, TEST
        (case insensitive)
    """
    if load_interfaces_ is None:
        load_interfaces_ = load_interfaces

    # We may want to put this logic inside inventory provider and serve from a new
    # endpoint
    ignored_words = ("SPARE", "NON-OPERATIONAL", "RESERVED", "TEST")
    relevant = {}
    for ifc in load_interfaces_(hosts):
        description = ifc["description"].upper()
        if "PHY" not in description:
            continue
        if any(word in description for word in ignored_words):
            continue
        relevant[(ifc["router"], ifc["name"])] = ifc
    return relevant


def main():
    """Load the config, collect the error counters and print the interfaces that
    currently have errors"""
    config = load(config_file=pathlib.Path(CONFIG_FILE))
    client = influx_client(config["influx"])
    all_interfaces = get_relevant_interfaces(config["inventory"])
    with client:
        result, exception_count = interface_errors(
            client,
            interface_info=all_interfaces,
            errors=ERROR_FIELDS,
            exclusions=config.get("exclude-interfaces", ()),
        )

    if exception_count:
        logger.warning(
            "%d interface(s) from influx were not found in inventory", exception_count
        )

    reported = {(c["router"], c["interface"]): c for c in result["interfaces"]}
    excluded = {(c["router"], c["interface"]) for c in result["excluded_interfaces"]}
    for key in sorted(all_interfaces):
        if key in excluded:
            continue
        if key not in reported:
            print(f"interface {key} not found in influx data")
            continue

        errors = reported[key]["error_counters"]

        if any(v > 0 for v in errors.values()):
            print(*key, ",".join(str(c) for c in errors.values()))

    # TODO: ensure data is from the day that we're interested in (today or yesterday)
    # TODO: send script failures to admin email


if __name__ == "__main__":
    main()
logger = logging.getLogger(__name__)

CONFIG_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "definitions": {
        "email-params": {
            "type": "object",
            "properties": {
                "from": {"type": "string"},
                "reply-to": {"type": "string"},
                "to": {"type": "string"},
                "contact": {"type": "string"},
            },
            "additionalProperties": False,
        },
        "influx-db-measurement": {
            "type": "object",
            "properties": {
                "ssl": {"type": "boolean"},
                "hostname": {"type": "string"},
                "port": {"type": "integer"},
                "username": {"type": "string"},
                "password": {"type": "string"},
                "database": {"type": "string"},
                "measurement": {"type": "string"},
            },
            "required": [
                # ssl, port are optional
                "hostname",
                "username",
                "password",
                "database",
                "measurement",
            ],
            "additionalProperties": False,
        },
    },
    "type": "object",
    "properties": {
        "email": {"$ref": "#/definitions/email-params"},
        "inventory": {
            "type": "array",
            "items": {"type": "string", "format": "uri"},
            "minItems": 1,
        },
        "influx": {"$ref": "#/definitions/influx-db-measurement"},
        "exclude-interfaces": {
            "type": "array",
            "items": {"type": "string"},
        },
    },
    # inventory is required: the error report cli reads config["inventory"]
    # unconditionally. exclude-interfaces stays optional
    "required": ["email", "inventory", "influx"],
    "additionalProperties": False,
}


def load(config_file: pathlib.Path):
    """
    loads, validates and returns configuration parameters

    :param config_file: path of the json configuration file (a ``pathlib.Path``;
        a plain string path is accepted as well). The file is read as utf-8
    :return: a dict containing configuration parameters
    :raises: json.JSONDecodeError, jsonschema.ValidationError
    """

    config = json.loads(pathlib.Path(config_file).read_text(encoding="utf-8"))
    jsonschema.validate(config, CONFIG_SCHEMA)

    return config