Skip to content
Snippets Groups Projects
Commit 65987a67 authored by Pelle Koster's avatar Pelle Koster
Browse files

WIP: initial work on new error report script

parent ade63e80
No related branches found
No related tags found
No related merge requests found
import logging
import pathlib
import pprint
from typing import Sequence, Set, Tuple
from brian_polling_manager.interface_stats.services import influx_client
from brian_polling_manager.inventory import load_interfaces
from influxdb import InfluxDBClient
from brian_polling_manager.error_report.config import load
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Email addressing constants. They are not referenced by any function visible
# in this file yet.
SEND_FROM = "ne@geant.org"
REPLY_TO = "noreply@geant.org"
SEND_TO = "pelle.koster@geant.org"

# Absolute path of the JSON config file that main() loads.
# NOTE(review): this is a developer-local path -- presumably a placeholder
# until the path becomes a command line argument; confirm.
CONFIG_FILE = "/home/pellek/develop/klanten/geant/error_report-config.json"

# WHERE-clause fragments that get_error_points() interpolates into its influx
# query: the last 24 hours, and the 24 hours before that.
TIME_WINDOW_TODAY = "time > now() - 1d"
TIME_WINDOW_YESTERDAY = "time > now() - 2d and time < now() - 1d"

# The desired error fields vs their field name in the influx query
# Each entry is (name used in the report, key looked up in the influx point).
ERROR_FIELDS = [
    ("framing-errors", "last_input_framing_errors"),
    ("bit-error-seconds", "last_bit_error_seconds"),
    ("errored-blocks-seconds", "last_errored_blocks_seconds"),
    ("input-crc-errors", "last_input_crc_errors"),
    ("input-total-errors", "last_input_total_errors"),
    ("input-discards", "last_input_discards"),
    ("input-drops", "last_input_drops"),
    ("output-drops", "last_output_drops"),
]
# JSON schema describing the processed error counters that feed the report.
#
# NOTE(review): interface_errors() builds its result under the keys
# "interfaces" / "excluded_interfaces", whereas this schema names them
# "counters" / "excluded_counters" -- the two need to be reconciled.
PROCESSED_ERROR_COUNTERS_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "definitions": {
        # A free-form mapping of error name -> integer count. The value
        # constraint belongs in ``additionalProperties`` on the object itself;
        # nesting it under ``properties`` would merely declare a property that
        # happens to be called "additionalProperties".
        "error_counters_content": {
            "type": "object",
            "additionalProperties": {"type": "integer"},
        },
        # Counters for an interface that is included in the report
        "interface_error_counters": {
            "type": "object",
            "properties": {
                "router": {"type": "string"},
                "interface": {"type": "string"},
                "description": {"type": "string"},
                "error_counters": {"$ref": "#/definitions/error_counters_content"},
                "diff": {"$ref": "#/definitions/error_counters_content"},
                "has_new_errors": {"type": "boolean"},
            },
            "required": [
                "router",
                "interface",
                "description",
                "error_counters",
                "diff",
                "has_new_errors",
            ],
        },
        # Counters for an excluded interface: no diff / has_new_errors
        "excluded_interface_error_counters": {
            "type": "object",
            "properties": {
                "router": {"type": "string"},
                "interface": {"type": "string"},
                "description": {"type": "string"},
                "error_counters": {"$ref": "#/definitions/error_counters_content"},
            },
            "required": [
                "router",
                "interface",
                "description",
                "error_counters",
            ],
        },
    },
    "type": "object",
    "properties": {
        # The keyword for array elements is ``items``; a misspelled ``item``
        # is silently ignored by validators, leaving the elements unchecked.
        "counters": {
            "type": "array",
            "items": {"$ref": "#/definitions/interface_error_counters"},
        },
        "excluded_counters": {
            "type": "array",
            "items": {"$ref": "#/definitions/excluded_interface_error_counters"},
        },
    },
    "required": ["counters", "excluded_counters"],
    "additionalProperties": False,
}
# Hand-written sample using the same top-level keys and entry fields that
# PROCESSED_ERROR_COUNTERS_SCHEMA declares. The ``...`` (Ellipsis) values are
# placeholders, not real data.
# NOTE(review): not referenced by any function visible in this file --
# presumably scratch data for developing the email template; confirm.
email_config = {
    "counters": [
        {
            "router": "rt1.some.router.geant.net",
            "interface": "ef-0/1/1",
            "description": "SOME DESCRIPTION",
            "error_counters": {"output-drops": 1},
            "diff": {"output-drops": 1},
            "has_new_errors": True,
        },
    ],
    "excluded_counters": [
        {
            "router": ...,
            "interface": ...,
            "description": ...,
            "error_counters": ...,
        },
    ],
}
def get_error_points(client: InfluxDBClient, time_window: str):
    """
    Fetch the most recent error point per interface within a time window

    :param client: InfluxDBClient used to run the query
    :param time_window: a WHERE-clause condition on ``time`` (see
        `TIME_WINDOW_TODAY`_ and `TIME_WINDOW_YESTERDAY`_)
    :return: a dict of {(hostname, interface_name): point}, holding the first
        point of every (hostname, interface_name) group in the query result
    """
    # Because every field is selected with last(), a single returned point may
    # combine values from multiple different stored points when the newest one
    # is missing some values. That is close enough for this report.
    query = (
        "SELECT last(*) FROM errors "
        f"WHERE {time_window} "
        "group by hostname, interface_name order by time desc;"
    )
    result_set = client.query(query)

    latest_points = {}
    for (_, tags), points in result_set.items():
        key = (tags["hostname"], tags["interface_name"])
        latest_points[key] = next(points)
    return latest_points
def interface_errors(
    client: InfluxDBClient,
    interface_info,
    errors,
    raise_on_errors=False,
    exclusions: Sequence[str] = (),
):
    """
    Retrieves error counters from influx

    :param client: InfluxDBClient for connecting to influx
    :param interface_info: a dict of {(router, interface): info_dict} with interface
        information coming from invprov (ie. the output from `get_relevant_interfaces`_)
    :param errors: A list of (result_field, input_data_field) for every error to report
        on (see `ERROR_FIELDS`_)
    :param raise_on_errors: raise when certain exceptions occur (useful for testing)
    :param exclusions: description fragments; an interface whose description contains
        any of them (case-insensitive) is reported under ``excluded_interfaces``
        without a diff. Defaults to no exclusions.
    :return: a tuple ``(result, exception_count)``. ``result`` is a dict with the
        lists ``interfaces`` and ``excluded_interfaces``; ``exception_count`` is the
        number of interfaces that were skipped because they could not be found in
        ``interface_info``.
    :raises KeyError: if ``raise_on_errors`` is set and an interface from influx is
        missing from ``interface_info``

    NOTE(review): PROCESSED_ERROR_COUNTERS_SCHEMA names the two lists ``counters``
    and ``excluded_counters``; the keys used here differ and should be reconciled.
    """
    todays_data = get_error_points(client, TIME_WINDOW_TODAY)
    yesterdays_data = get_error_points(client, TIME_WINDOW_YESTERDAY)

    result = {"interfaces": [], "excluded_interfaces": []}
    exception_count = 0

    for (router, ifc), today in todays_data.items():
        try:
            info = interface_info[(router, ifc)]
        except KeyError:
            logger.exception(f"{router} - {ifc} not found in inventory provider")
            exception_count += 1
            if raise_on_errors:
                raise
            # Without inventory info there is no description to report on.
            # Skip this interface rather than fall through with an unbound
            # (or stale, from the previous iteration) ``info``.
            continue

        yesterday = yesterdays_data.get((router, ifc), {})

        counters = {
            # Influx yields None for missing values; report those as 0
            "error_counters": {name: (today[field] or 0) for name, field in errors},
            "router": router,
            "interface": ifc,
            "description": info["description"],
        }

        if is_excluded_interface(info["description"], exclusions):
            result["excluded_interfaces"].append(counters)
            continue

        counters["diff"] = {
            name: (today[field] or 0) - (yesterday.get(field, 0) or 0)
            for name, field in errors
        }
        # This is strictly not the most correct way to determine whether we have
        # new errors (During the day the error count may have reset, some diffs may
        # actually be negative), but it is (mostly) how it was determined previously
        counters["has_new_errors"] = any(v > 0 for v in counters["diff"].values())
        result["interfaces"].append(counters)

    return result, exception_count
def is_excluded_interface(description: str, exclusions: Sequence[str]):
    """
    Tell whether an interface should be left out because it is too noisy

    :param description: the interface description
    :param exclusions: description fragments that mark an interface as excluded
    :return: True if any fragment occurs in the description, ignoring case
    """
    # We may want to put this logic inside inventory provider
    for fragment in exclusions:
        if fragment.lower() in description.lower():
            return True
    return False
def get_relevant_interfaces(hosts, load_interfaces_=load_interfaces):
    """
    Get interface info from inventory provider, keeping only the interfaces that
    are relevant based on their description

    An interface is kept when its upper-cased description contains "PHY" and none
    of "SPARE", "NON-OPERATIONAL", "RESERVED" or "TEST".

    :param hosts: passed on unchanged to ``load_interfaces_``
    :param load_interfaces_: callable returning an iterable of interface dicts
        with at least the keys ``router``, ``name`` and ``description``
    :return: a dict of {(router, name): interface_dict}
    """
    # We may want to put this logic inside inventory provider and serve from a new
    # endpoint
    unwanted_markers = ("SPARE", "NON-OPERATIONAL", "RESERVED", "TEST")

    relevant = {}
    for ifc in load_interfaces_(hosts):
        description = ifc["description"].upper()
        if "PHY" not in description:
            continue
        if any(marker in description for marker in unwanted_markers):
            continue
        relevant[(ifc["router"], ifc["name"])] = ifc
    return relevant
def main():
    """
    Load the config, collect the error counters and print every known interface
    that has at least one non-zero error counter

    Interfaces from inventory provider without data in influx are reported on
    stdout as well.

    NOTE(review): config["exclude-interfaces"] is not passed on to the exclusion
    check yet.
    """
    config = load(config_file=pathlib.Path(CONFIG_FILE))
    client = influx_client(config["influx"])
    all_interfaces = get_relevant_interfaces(config["inventory"])
    with client:
        # interface_errors requires the inventory info and returns a
        # (result, exception_count) tuple
        processed, _exception_count = interface_errors(
            client, interface_info=all_interfaces, errors=ERROR_FIELDS
        )

    # Index the counters by (router, interface) so that they can be looked up
    # with the same keys that get_relevant_interfaces() produces
    counters_by_interface = {
        (entry["router"], entry["interface"]): entry["error_counters"]
        for entry in processed["interfaces"] + processed["excluded_interfaces"]
    }

    for key in sorted(all_interfaces):
        if key not in counters_by_interface:
            print(f"interface {key} not found in influx data")
            continue
        errors = counters_by_interface[key]
        if any(v > 0 for v in errors.values()):
            print(*key, ",".join(str(c) for c in errors.values()))


# TODO: ensure data is from the day that we're interested in (today or yesterday)
# TODO: send script failures to admin email
if __name__ == "__main__":
    main()
{
"email": {
"from": "noreply@geant.org",
"reply-to": "noreply@geant.org",
"to": "some-bogus-email",
"contact": "someone@geant.org / NE team"
},
"inventory": ["blah"],
"influx": {
"hostname": "hostname",
"database": "dbname",
"measurement": "errors",
"username": "some-username",
"password": "user-password"
},
"exclude-interfaces": [
"SOME DESCRIPTION PART"
]
}
import json
import logging.config
import pathlib
import jsonschema
logger = logging.getLogger(__name__)
# JSON schema that load() validates the configuration file against.
CONFIG_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "definitions": {
        # Email settings. None of the keys are required by this schema.
        "email-params": {
            "type": "object",
            "properties": {
                "from": {"type": "string"},
                "reply-to": {"type": "string"},
                "to": {"type": "string"},
                "contact": {"type": "string"},
            },
            "additionalProperties": False,
        },
        # Connection parameters for the influx database and measurement
        "influx-db-measurement": {
            "type": "object",
            "properties": {
                "ssl": {"type": "boolean"},
                "hostname": {"type": "string"},
                "port": {"type": "integer"},
                "username": {"type": "string"},
                "password": {"type": "string"},
                "database": {"type": "string"},
                "measurement": {"type": "string"},
            },
            "required": [
                # ssl, port are optional
                "hostname",
                "username",
                "password",
                "database",
                "measurement",
            ],
            "additionalProperties": False,
        },
    },
    "type": "object",
    "properties": {
        "email": {"$ref": "#/definitions/email-params"},
        # A non-empty list of strings, annotated with the "uri" format
        "inventory": {
            "type": "array",
            "items": {"type": "string", "format": "uri"},
            "minItems": 1,
        },
        "influx": {"$ref": "#/definitions/influx-db-measurement"},
        # Description fragments of interfaces to exclude
        "exclude-interfaces": {
            "type": "array",
            "items": {"type": "string"},
        },
    },
    # NOTE(review): "inventory" is not required here, yet the report script's
    # main() reads config["inventory"] unconditionally -- confirm whether it
    # should be added to this list.
    "required": ["email", "influx"],
    "additionalProperties": False,
}
def load(config_file: pathlib.Path):
    """
    Read a JSON configuration file and validate it against `CONFIG_SCHEMA`_

    :param config_file: path of the configuration file (its text is read with
        ``read_text()``)
    :return: a dict containing configuration parameters
    :raises: json.JSONDecodeError, jsonschema.ValidationError
    """
    raw_text = config_file.read_text()
    parsed = json.loads(raw_text)
    jsonschema.validate(instance=parsed, schema=CONFIG_SCHEMA)
    return parsed
<html>
<body>
<pre>
{%- if interfaces %}
{%- for ifc in interfaces %}
=================================
{{ ifc.router }}
=================================
{{ ifc.interface }} {{ ifc.description }}
{%- for name, count in ifc["error_counters"].items() %}
{{ name }} {{ count }} Diff: {{ ifc["diff"][name] }}
{%- endfor %}
{%- endfor %}
{%- endif %}
</pre>
</body>
</html>
def send_email(errors, config):
    """
    Placeholder for sending the error report by email; not implemented yet

    :param errors: currently unused
    :param config: currently unused
    :return: None
    """
    return None
def render_body(errors, template_file):
    """
    Placeholder for rendering the email body; not implemented yet

    :param errors: currently unused
    :param template_file: currently unused
    :return: None
    """
    return None
if __name__ == "__main__":
pass
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment