Skip to content
Snippets Groups Projects
Commit 65987a67 authored by Pelle Koster's avatar Pelle Koster
Browse files

WIP: initial work on new error report script

parent ade63e80
Branches
Tags
No related merge requests found
import logging
import pathlib
import pprint
from typing import Sequence, Set, Tuple
from brian_polling_manager.interface_stats.services import influx_client
from brian_polling_manager.inventory import load_interfaces
from influxdb import InfluxDBClient
from brian_polling_manager.error_report.config import load
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Email addresses for the report. They are not referenced by any of the code
# visible in this module yet.
SEND_FROM = "ne@geant.org"
REPLY_TO = "noreply@geant.org"
SEND_TO = "pelle.koster@geant.org"

# Path of the configuration file that `main` passes to `load`.
# NOTE(review): this is an absolute, developer-local path; presumably it should
# become a command line argument or similar before this leaves WIP -- confirm.
CONFIG_FILE = "/home/pellek/develop/klanten/geant/error_report-config.json"

# Fragments that are inserted into the WHERE clause of the influx query in
# `get_error_points`.
TIME_WINDOW_TODAY = "time > now() - 1d"
TIME_WINDOW_YESTERDAY = "time > now() - 2d and time < now() - 1d"

# The desired error fields vs their field name in the influx query
# Each entry is (name used in the result, name of the field in the influx point).
ERROR_FIELDS = [
    ("framing-errors", "last_input_framing_errors"),
    ("bit-error-seconds", "last_bit_error_seconds"),
    ("errored-blocks-seconds", "last_errored_blocks_seconds"),
    ("input-crc-errors", "last_input_crc_errors"),
    ("input-total-errors", "last_input_total_errors"),
    ("input-discards", "last_input_discards"),
    ("input-drops", "last_input_drops"),
    ("output-drops", "last_output_drops"),
]
# JSON schema for the processed error counters: a list of interfaces with their
# error counters and the difference with the previous day, plus a list of excluded
# interfaces for which only the error counters are given.
#
# NOTE(review): `interface_errors` builds a dict with the keys "interfaces" and
# "excluded_interfaces", while this schema names them "counters" and
# "excluded_counters". One of the two needs to change before validating against
# this schema.
PROCESSED_ERROR_COUNTERS_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "definitions": {
        "error_counters_content": {
            "type": "object",
            # Any counter name is accepted, but every value must be an integer.
            # (`additionalProperties` must sit next to `type`; nested inside
            # `properties` it would only describe a property with that name.)
            "additionalProperties": {"type": "integer"},
        },
        "interface_error_counters": {
            "type": "object",
            "properties": {
                "router": {"type": "string"},
                "interface": {"type": "string"},
                "description": {"type": "string"},
                "error_counters": {"$ref": "#/definitions/error_counters_content"},
                "diff": {"$ref": "#/definitions/error_counters_content"},
                "has_new_errors": {"type": "boolean"},
            },
            "required": [
                "router",
                "interface",
                "description",
                "error_counters",
                "diff",
                "has_new_errors",
            ],
        },
        "excluded_interface_error_counters": {
            "type": "object",
            "properties": {
                "router": {"type": "string"},
                "interface": {"type": "string"},
                "description": {"type": "string"},
                "error_counters": {"$ref": "#/definitions/error_counters_content"},
            },
            "required": [
                "router",
                "interface",
                "description",
                "error_counters",
            ],
        },
    },
    "type": "object",
    "properties": {
        "counters": {
            "type": "array",
            # The keyword is `items`; a misspelled keyword is silently ignored by
            # validators, leaving the array elements unchecked.
            "items": {"$ref": "#/definitions/interface_error_counters"},
        },
        "excluded_counters": {
            "type": "array",
            "items": {"$ref": "#/definitions/excluded_interface_error_counters"},
        },
    },
    "required": ["counters", "excluded_counters"],
    "additionalProperties": False,
}
# Example of the data layout described by PROCESSED_ERROR_COUNTERS_SCHEMA: one
# fully filled in interface entry and one excluded entry whose values are
# Ellipsis placeholders. It is not referenced by any code visible in this module.
email_config = {
    "counters": [
        {
            "router": "rt1.some.router.geant.net",
            "interface": "ef-0/1/1",
            "description": "SOME DESCRIPTION",
            "error_counters": {"output-drops": 1},
            "diff": {"output-drops": 1},
            "has_new_errors": True,
        },
    ],
    "excluded_counters": [
        {
            # placeholders, not real values
            "router": ...,
            "interface": ...,
            "description": ...,
            "error_counters": ...,
        },
    ],
}
def get_error_points(client: InfluxDBClient, time_window: str):
    """
    Queries influx for the last error values of every interface

    :param client: InfluxDBClient used to run the query
    :param time_window: a condition on ``time`` that is inserted verbatim into the
        WHERE clause (see `TIME_WINDOW_TODAY`_ and `TIME_WINDOW_YESTERDAY`_)
    :result: a dict of {(hostname, interface_name): point}, where point is the first
        point of the series returned for that hostname/interface combination
    """
    # This query may actually return values from multiple different points if
    # some values are missing for the last point. But it's close enough.
    query = (
        "SELECT last(*) FROM errors "
        f"WHERE {time_window} "
        "group by hostname, interface_name order by time desc;"
    )
    response = client.query(query)

    latest_points = {}
    for (_, tags), points in response.items():
        key = (tags["hostname"], tags["interface_name"])
        # only the first point of every series is of interest
        latest_points[key] = next(points)
    return latest_points
def interface_errors(
    client: InfluxDBClient,
    interface_info,
    errors,
    raise_on_errors=False,
    exclusions: Sequence[str] = (),
):
    """
    Retrieves error counters from influx

    :param client: InfluxDBClient for connecting to influx
    :param interface_info: a dict of {(router, interface): info_dict} with interface
        information coming from invprov (ie. the output from `get_relevant_interfaces`_)
    :param errors: A list of (result_field, input_data_field) for every error to report
        on (see `ERROR_FIELDS`_)
    :param raise_on_errors: raise when certain exceptions occur (useful for testing)
    :param exclusions: description parts handed to `is_excluded_interface`_. Interfaces
        with a matching description end up in ``excluded_interfaces`` and get no
        ``diff``/``has_new_errors``. By default no interface is excluded.
    :result: a tuple ``(result, exception_count)``. ``result`` is a dict with the keys
        ``interfaces`` and ``excluded_interfaces``, each a list of counter dicts.
        ``exception_count`` is the number of interfaces that were skipped because they
        could not be found in ``interface_info``.

    NOTE(review): PROCESSED_ERROR_COUNTERS_SCHEMA names the two lists ``counters`` and
    ``excluded_counters``; the key names used here differ and need to be reconciled.
    """
    todays_data = get_error_points(client, TIME_WINDOW_TODAY)
    yesterdays_data = get_error_points(client, TIME_WINDOW_YESTERDAY)

    result = {"interfaces": [], "excluded_interfaces": []}
    exception_count = 0

    for (router, ifc), today in todays_data.items():
        try:
            info = interface_info[(router, ifc)]
        except KeyError:
            logger.exception(f"{router} - {ifc} not found in inventory provider")
            exception_count += 1
            if raise_on_errors:
                raise
            # Without inventory info there is nothing to report for this interface.
            # Carrying on would hit an unbound `info` (or silently reuse the one
            # from the previous iteration).
            continue

        yesterday = yesterdays_data.get((router, ifc), {})

        counters = {
            # influx may return None for a field, which counts as 0
            "error_counters": {name: (today[field] or 0) for name, field in errors},
            "router": router,
            "interface": ifc,
            "description": info["description"],
        }

        if is_excluded_interface(info["description"], exclusions):
            result["excluded_interfaces"].append(counters)
            continue

        counters["diff"] = {
            name: (today[field] or 0) - (yesterday.get(field, 0) or 0)
            for name, field in errors
        }
        # This is strictly not the most correct way to determine whether we have
        # new errors (During the day the error count may have reset, some diffs may
        # actually be negative), but it is (mostly) how it was determined previously
        counters["has_new_errors"] = any(v > 0 for v in counters["diff"].values())
        result["interfaces"].append(counters)

    return result, exception_count
def is_excluded_interface(description: str, exclusions: Sequence[str]):
    """
    Some interfaces generate a lot of noise and should be excluded

    :param description: the interface description
    :param exclusions: description parts that mark an interface as excluded
    :result: True if any of the exclusions occurs in the description (compared case
        insensitively), False otherwise
    """
    # We may want to put this logic inside inventory provider
    for part in exclusions:
        if part.lower() in description.lower():
            return True
    return False
def get_relevant_interfaces(hosts, load_interfaces_=load_interfaces):
    """
    Get interface info from inventory provider. Some interfaces are not considered
    based on their description

    :param hosts: passed as is to ``load_interfaces_``
    :param load_interfaces_: callable returning an iterable of interface dicts that
        have at least the keys ``router``, ``name`` and ``description``
    :result: a dict of {(router, name): interface_dict} containing only interfaces whose
        upper cased description contains "PHY" and none of "SPARE",
        "NON-OPERATIONAL", "RESERVED" or "TEST"
    """
    # We may want to put this logic inside inventory provider and serve from a new
    # endpoint
    unwanted = ("SPARE", "NON-OPERATIONAL", "RESERVED", "TEST")

    relevant = {}
    for ifc in load_interfaces_(hosts):
        description = ifc["description"].upper()
        if "PHY" not in description:
            continue
        if any(word in description for word in unwanted):
            continue
        relevant[(ifc["router"], ifc["name"])] = ifc
    return relevant
def main():
    """
    Prints the error counters of all relevant interfaces that currently have errors

    Loads the config from `CONFIG_FILE`_, gets the relevant interfaces from inventory
    provider and their error counters from influx. For every relevant interface
    (sorted by router and interface name) it prints either a message that no influx
    data was found, or, if any counter is larger than zero, the router, the interface
    and the comma separated counter values.
    """
    config = load(config_file=pathlib.Path(CONFIG_FILE))
    client = influx_client(config["influx"])
    all_interfaces = get_relevant_interfaces(config["inventory"])
    with client:
        # interface_errors requires the inventory info and returns a
        # (result, exception_count) tuple, not a mapping keyed by interface
        result, _exception_count = interface_errors(
            client, interface_info=all_interfaces, errors=ERROR_FIELDS
        )

    # NOTE(review): the "exclude-interfaces" list from the config is not forwarded to
    # the exclusion check; confirm how it should be wired in.

    # Index the counters of both lists by (router, interface) for the lookup below
    all_error_counters = {
        (entry["router"], entry["interface"]): entry["error_counters"]
        for entry in result["interfaces"] + result["excluded_interfaces"]
    }

    for key in sorted(all_interfaces):
        if key not in all_error_counters:
            print(f"interface {key} not found in influx data")
            continue
        errors = all_error_counters[key]
        if any(v > 0 for v in errors.values()):
            print(*key, ",".join(str(c) for c in errors.values()))

    # TODO: ensure data is from the day that we're interested in (today or yesterday)
    # TODO: send script failures to admin email


if __name__ == "__main__":
    main()
{
"email": {
"from": "noreply@geant.org",
"reply-to": "noreply@geant.org",
"to": "some-bogus-email",
"contact": "someone@geant.org / NE team"
},
"inventory": ["blah"],
"influx": {
"hostname": "hostname",
"database": "dbname",
"measurement": "errors",
"username": "some-username",
"password": "user-password"
},
"exclude-interfaces": [
"SOME DESCRIPTION PART"
]
}
import json
import logging.config
import pathlib
import jsonschema
logger = logging.getLogger(__name__)

# Schema of the "email" section: all fields are optional strings, unknown fields are
# rejected.
_EMAIL_PARAMS_SCHEMA = {
    "type": "object",
    "properties": {
        "from": {"type": "string"},
        "reply-to": {"type": "string"},
        "to": {"type": "string"},
        "contact": {"type": "string"},
    },
    "additionalProperties": False,
}

# Schema of the "influx" section: connection parameters plus the measurement name,
# unknown fields are rejected.
_INFLUX_DB_MEASUREMENT_SCHEMA = {
    "type": "object",
    "properties": {
        "ssl": {"type": "boolean"},
        "hostname": {"type": "string"},
        "port": {"type": "integer"},
        "username": {"type": "string"},
        "password": {"type": "string"},
        "database": {"type": "string"},
        "measurement": {"type": "string"},
    },
    "required": [
        # ssl, port are optional
        "hostname",
        "username",
        "password",
        "database",
        "measurement",
    ],
    "additionalProperties": False,
}

# Schema of the complete configuration file, as validated by `load`. Only the
# "email" and "influx" sections are required.
CONFIG_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "definitions": {
        "email-params": _EMAIL_PARAMS_SCHEMA,
        "influx-db-measurement": _INFLUX_DB_MEASUREMENT_SCHEMA,
    },
    "type": "object",
    "properties": {
        "email": {"$ref": "#/definitions/email-params"},
        "inventory": {
            "type": "array",
            "items": {"type": "string", "format": "uri"},
            "minItems": 1,
        },
        "influx": {"$ref": "#/definitions/influx-db-measurement"},
        "exclude-interfaces": {
            "type": "array",
            "items": {"type": "string"},
        },
    },
    "required": ["email", "influx"],
    "additionalProperties": False,
}
def load(config_file: pathlib.Path):
    """
    loads, validates and returns configuration parameters

    :param config_file: path of the json configuration file
    :return: a dict containing configuration parameters, validated against
        `CONFIG_SCHEMA`_
    :raises: json.JSONDecodeError, jsonschema.ValidationError
    """
    raw_text = config_file.read_text()
    params = json.loads(raw_text)
    jsonschema.validate(params, CONFIG_SCHEMA)
    return params
{#-
  Body of the error report email.

  Expects `interfaces`: a list of entries that each have `router`, `interface`,
  `description`, `error_counters` and `diff`, where `diff` has the same keys as
  `error_counters`.
-#}
<html>
<body>
<pre>
{%- if interfaces %}
{%- for ifc in interfaces %}
=================================
{{ ifc.router }}
=================================
    {{ ifc.interface }}    {{ ifc.description }}
{%- for name, value in ifc.error_counters.items() %}
        {{ name }} {{ value }} Diff: {{ ifc.diff[name] }}
{%- endfor %}
{%- endfor %}
{%- endif %}
</pre>
</body>
</html>
def send_email(errors, config):
    """Placeholder: nothing is sent yet, both arguments are ignored. Returns None."""
def render_body(errors, template_file):
    """Placeholder: nothing is rendered yet, both arguments are ignored. Returns None."""
# Script entry point; intentionally does nothing yet.
if __name__ == "__main__":
    pass
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment