Skip to content
Snippets Groups Projects
Commit be07c2f6 authored by Pelle Koster's avatar Pelle Koster
Browse files

Support Nokia routers for error report & more

parent b25e4648
Branches
Tags
No related merge requests found
......@@ -12,14 +12,16 @@ points. For every interface, the latest error counts are compared against yester
error count to determine whether it has suffered new errors. Currently the following
errors are checked:
* ``framing-errors``
* ``bit-error-seconds``
* ``errored-blocks-seconds``
* ``input-crc-errors``
* ``input-total-errors``
* ``input-discards``
* ``input-drops``
* ``output-drops``
* ``bit-error-seconds`` (Juniper)
* ``errored-blocks-seconds`` (Juniper)
* ``framing-errors`` (Juniper)
* ``input-crc-errors`` (Juniper)
* ``input-total-errors`` (Juniper, Nokia)
* ``input-discards``(Juniper, Nokia)
* ``input-drops`` (Juniper)
* ``output-discards`` (Nokia)
* ``output-drops`` (Juniper)
* ``output-total-errors`` (Nokia)
For every interface with new errors is added to a summary report. This report is then
sent to the OC.
......@@ -60,14 +62,16 @@ logger = logging.getLogger(__name__)
# The error field names in the influx query vs their reporting name
ERROR_FIELDS = {
"last_input_framing_errors": "framing-errors",
"last_bit_error_seconds": "bit-error-seconds",
"last_errored_blocks_seconds": "errored-blocks-seconds",
"last_input_framing_errors": "framing-errors",
"last_input_crc_errors": "input-crc-errors",
"last_input_total_errors": "input-total-errors",
"last_input_discards": "input-discards",
"last_input_drops": "input-drops",
"last_input_total_errors": "input-total-errors",
"last_output_discards": "output-discards",
"last_output_drops": "output-drops",
"last_output_total_errors": "output-total-errors",
}
INFLUX_TIME_WINDOW_TODAY = "time > now() - 1d"
......@@ -79,14 +83,16 @@ PROCESSED_ERROR_COUNTERS_SCHEMA = {
"error_counters_content": {
"type": "object",
"properties": {
"framing-errors": {"type": "integer"},
"bit-error-seconds": {"type": "integer"},
"errored-blocks-seconds": {"type": "integer"},
"framing-errors": {"type": "integer"},
"input-crc-errors": {"type": "integer"},
"input-total-errors": {"type": "integer"},
"input-discards": {"type": "integer"},
"input-drops": {"type": "integer"},
"input-total-errors": {"type": "integer"},
"output-discards": {"type": "integer"},
"output-drops": {"type": "integer"},
"output-total-errors": {"type": "integer"},
},
"additionalProperties": False,
},
......@@ -96,14 +102,17 @@ PROCESSED_ERROR_COUNTERS_SCHEMA = {
"router": {"type": "string"},
"interface": {"type": "string"},
"description": {"type": "string"},
"error_counters": {"$ref": "#/definitions/error_counters_content"},
"errors_today": {"$ref": "#/definitions/error_counters_content"},
"errors_yesterday": {"$ref": "#/definitions/error_counters_content"},
"diff": {"$ref": "#/definitions/error_counters_content"},
},
"required": [
"router",
"interface",
"description",
"error_counters",
"errors_today",
"errors_yesterday",
"diff",
],
"additionalProperties": False,
},
......@@ -113,13 +122,11 @@ PROCESSED_ERROR_COUNTERS_SCHEMA = {
"router": {"type": "string"},
"interface": {"type": "string"},
"description": {"type": "string"},
"error_counters": {"$ref": "#/definitions/error_counters_content"},
},
"required": [
"router",
"interface",
"description",
"error_counters",
],
"additionalProperties": False,
},
......@@ -140,12 +147,31 @@ PROCESSED_ERROR_COUNTERS_SCHEMA = {
}
def setup_logging():
class MessageCountingLogHandler(logging.NullHandler):
def __init__(self, level=logging.NOTSET) -> None:
super().__init__(level)
self.count = 0
def handle(self, _) -> None:
self.count += 1
def setup_logging(debug=False) -> MessageCountingLogHandler:
"""
:param debug: Boolean. Set log level to DEBUG if True, or INFO otherwise
:returns: a MessageCounter object that tracks error log messages
"""
level = logging.DEBUG if debug else logging.INFO
counter = MessageCountingLogHandler(level=logging.ERROR)
stream_handler = logging.StreamHandler(sys.stderr)
stream_handler.setLevel(level)
logging.basicConfig(
stream=sys.stderr,
level="INFO",
format="%(asctime)s - %(levelname)s - %(message)s",
level=level,
handlers=[counter, stream_handler],
)
return counter
def get_error_points(client: InfluxDBClient, time_window: str):
......@@ -191,7 +217,6 @@ def interface_errors(
interface_info,
errors,
exclusions=(),
raise_on_errors=False,
):
"""
Retrieves error counters from influx
......@@ -211,65 +236,52 @@ def interface_errors(
key: select_error_fields(val, mapping=errors)
for key, val in error_points_today.items()
}
yesterdays_data = {
key: select_error_fields(val, mapping=errors)
for key, val in error_points_yesterday.items()
}
result = {"interfaces": [], "excluded_interfaces": []}
new_errors = {}
for (router, ifc), info in interface_info.items():
error_info = {
"router": router,
"interface": ifc,
"description": info["description"],
}
if is_excluded_interface(info, exclusions):
logger.info(f"Found excluded interface {router} - {ifc}")
result["excluded_interfaces"].append(error_info)
continue
try:
today = todays_data[(router, ifc)]
except KeyError:
logger.error(f"{router} - {ifc} not found in influx data")
if raise_on_errors:
raise
continue
if not any(err > 0 for err in today.values()):
today_nonzero = {err: val for err, val in today.items() if val > 0}
if not today_nonzero:
# skip interfaces without any errors
continue
yesterday = yesterdays_data.get((router, ifc), {})
yesterday = select_error_fields(
error_points_yesterday.get((router, ifc), {}), mapping=errors
)
counters = {
"router": router,
"interface": ifc,
"error_counters": today,
"description": info["description"],
diff = {
err: (val - yesterday[err])
for err, val in today_nonzero.items()
if (val - yesterday[err]) > 0
}
if is_excluded_interface(info, exclusions):
logger.info(f"Found excluded interface {router} - {ifc}")
result["excluded_interfaces"].append(counters)
if not diff:
# Skip interface if it does not have any increased error counters
continue
nonzero_errors = {err: val for err, val in today.items() if val > 0}
counters["error_counters"] = nonzero_errors
new_errors[(router, ifc)] = sum(nonzero_errors.values())
error_info["errors_today"] = today_nonzero
error_info["diff"] = diff
error_info["errors_yesterday"] = yesterday
if any(yesterday.values()):
# we have existing errors
# This is strictly not the most correct way to determine differences.
# during the day the error count may have reset and diffs may actually
# be negative, but we ignore those because that is (mostly) how it was
# done in the orginal bash script
diff = {
err: (val - yesterday[err])
for err, val in nonzero_errors.items()
if (val - yesterday[err]) > 0
}
if not diff:
# Skip interface if it does not have any increased error counters
continue
counters["diff"] = diff
new_errors[(router, ifc)] = sum(diff.values())
result["interfaces"].append(counters)
new_errors[(router, ifc)] = sum(diff.values())
result["interfaces"].append(error_info)
result["interfaces"].sort(
key=lambda i: (
......@@ -377,9 +389,13 @@ def main(config: dict, send_mail: bool):
"or printing the report to stdout (--no-email). Default: --email",
)
def cli(config, send_mail):
setup_logging()
error_counter = setup_logging()
config = load(config_file=config)
main(config, send_mail=send_mail)
if error_counter.count > 0:
raise click.ClickException(
"Errors were encountered while generating error report"
)
if __name__ == "__main__":
......
......@@ -5,76 +5,85 @@
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>GÉANT Interface error report</title>
<style>
html {
font-family: monospace;
font-size: 12pt;
}
html {
font-family: monospace;
font-size: 12pt;
}
section, footer {
padding: 20pt 0 0 20pt;
}
section, footer {
margin: 20pt 0 0 20pt;
}
.interface-block {
padding-bottom: 20pt;
}
.interface-block {
margin-bottom: 20pt;
}
.interface-errors {
padding-left: 20pt;
}
.interface-errors {
margin-left: 20pt;
}
ul {
list-style-type: none;
}
.interface-list {
list-style-type: none;
}
h3, p, ul {
margin: 0pt;
}
h3, p, ul {
margin: 0pt;
}
.error-table {
padding-left: 20pt;
}
.error-table {
margin-left: 10pt;
}
td {
padding-right: 10pt;
}
th {
text-align: right;
font-weight: normal;
}
td.diff {
padding-left: 20pt;
}
td {
width: 80pt;
text-align: right;
margin-left: 20pt;
}
.first-column {
min-width: 150pt;
}
.yesterday {
color: #757575;
}
.diff {
font-weight: bold;
}
</style>
</head>
<body>
<section>
<div class="interface-report">
{%- for ifc in interfaces %}
<div class="interface-block">
<h3>{{ ifc.router }}</h3>
<div class="interface-errors">
<p><strong>{{ ifc.interface }}</strong> {{ ifc.description }}</p>
<table class="error-table">
{%- if ifc.diff %}
<section class="interface-report">
{%- for ifc in interfaces %}
<div class="interface-block">
<h3>{{ ifc.router }}<small> - {{ ifc.interface }}</small></h3>
<div class="interface-errors">
<p>{{ ifc.description }}</p>
<table class="error-table">
<tr>
<th class="first-column"></th>
<th class="yesterday">Yesterday</th>
<th>Today</th>
<th class="diff">Diff</th>
</tr>
{%- for err, diff in ifc.diff.items() %}
<tr>
<td>{{ err }}</td>
<td>{{ ifc.error_counters[err] }}</td>
<td class="diff">Diff:</td>
<td>{{ diff }}</td>
</tr>
{%- endfor %}
{%- else %}
{%- for err, val in ifc.error_counters.items() %}
<tr>
<td>{{ err }}</td>
<td>{{ val }}</td>
</tr>
<tr>
<td class="first-column">{{ err }}</td>
<td class="yesterday">{{ ifc.errors_yesterday[err] }}</td>
<td>{{ ifc.errors_today[err] }}</td>
<td class="diff">{{ diff }}</td>
</tr>
{%- endfor %}
{%- endif %}
</table>
</div>
</table>
</div>
{%- endfor %}
</div>
{%- endfor %}
</section>
{%- if excluded_interfaces %}
<section class="excluded">
......
......@@ -46,5 +46,14 @@
}
],
"snmp-index": 9999
},
{
"router": "rt0.ams.nl.geant.net",
"name": "lag-1",
"bundle": [],
"bundle-parents": [],
"description": "PHY nokia",
"circuits": [],
"snmp-index": 9999
}
]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>GÉANT Interface error report</title>
<style>
html {
font-family: monospace;
font-size: 12pt;
}
section, footer {
padding: 20pt 0 0 20pt;
}
.interface-block {
padding-bottom: 20pt;
}
.interface-errors {
padding-left: 20pt;
}
ul {
list-style-type: none;
}
h3, p, ul {
margin: 0pt;
}
.error-table {
padding-left: 20pt;
}
td {
padding-right: 10pt;
}
td.diff {
padding-left: 20pt;
}
</style>
</head>
<body>
<section>
<div class="interface-report">
<div class="interface-block">
<h3>mx1.ams.nl.geant.net</h3>
<div class="interface-errors">
<p><strong>ae1</strong> PHY blah blah</p>
<table class="error-table">
<tr>
<td>framing-errors</td>
<td>4</td>
<td class="diff">Diff:</td>
<td>2</td>
</tr>
<tr>
<td>input-drops</td>
<td>2</td>
<td class="diff">Diff:</td>
<td>1</td>
</tr>
</table>
</div>
</div>
<div class="interface-block">
<h3>mx1.fra.de.geant.net</h3>
<div class="interface-errors">
<p><strong>ae10</strong> PHY blah blah foo</p>
<table class="error-table">
<tr>
<td>input-drops</td>
<td>3</td>
</tr>
</table>
</div>
</div>
</div>
</section>
<footer>Generated SOME_DATE</footer>
</body>
</html>
\ No newline at end of file
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>GÉANT Interface error report</title>
<style>
html {
font-family: monospace;
font-size: 12pt;
}
section, footer {
padding: 20pt 0 0 20pt;
}
.interface-block {
padding-bottom: 20pt;
}
.interface-errors {
padding-left: 20pt;
}
ul {
list-style-type: none;
}
h3, p, ul {
margin: 0pt;
}
.error-table {
padding-left: 20pt;
}
td {
padding-right: 10pt;
}
td.diff {
padding-left: 20pt;
}
</style>
</head>
<body>
<section>
<div class="interface-report">
<div class="interface-block">
<h3>mx1.ams.nl.geant.net</h3>
<div class="interface-errors">
<p><strong>ae1</strong> PHY blah blah</p>
<table class="error-table">
<tr>
<td>input-drops</td>
<td>2</td>
</tr>
</table>
</div>
</div>
</div>
</section>
<section class="excluded">
<p>EXCLUDED INTERFACES</p>
<ul class="interface-list">
<li>mx1.fra.de.geant.net - ae10 - PHY blah blah foo</li>
</ul>
</section>
<footer>Generated SOME_DATE</footer>
</body>
</html>
\ No newline at end of file
......@@ -18,6 +18,7 @@ from brian_polling_manager.error_report.cli import (
INFLUX_TIME_WINDOW_TODAY,
INFLUX_TIME_WINDOW_YESTERDAY,
PROCESSED_ERROR_COUNTERS_SCHEMA,
MessageCountingLogHandler,
_filter_and_sort_interfaces,
get_error_points,
get_relevant_interfaces,
......@@ -247,6 +248,9 @@ def test_select_error_fields_skips_other_fields():
}
DEFAULT_ERRORS_YESTERDAY = select_error_fields({}, ERROR_FIELDS)
def test_interface_errors_with_new_errors(create_error_point, get_interface_errors):
create_error_point(
"mx1.ams.nl.geant.net",
......@@ -268,7 +272,7 @@ def test_interface_errors_with_new_errors(create_error_point, get_interface_erro
"router": "mx1.ams.nl.geant.net",
"interface": "ae1",
"description": "PHY blah blah",
"error_counters": {
"errors_today": {
"framing-errors": 1,
"bit-error-seconds": 2,
"errored-blocks-seconds": 3,
......@@ -278,6 +282,17 @@ def test_interface_errors_with_new_errors(create_error_point, get_interface_erro
"input-drops": 7,
"output-drops": 8,
},
"diff": {
"framing-errors": 1,
"bit-error-seconds": 2,
"errored-blocks-seconds": 3,
"input-crc-errors": 4,
"input-total-errors": 5,
"input-discards": 6,
"input-drops": 7,
"output-drops": 8,
},
"errors_yesterday": DEFAULT_ERRORS_YESTERDAY,
}
],
"excluded_interfaces": [],
......@@ -295,17 +310,21 @@ def test_interface_errors_with_multiple_interfaces(
"router": "mx1.ams.nl.geant.net",
"interface": "ae1",
"description": "PHY blah blah",
"error_counters": {
"errors_today": {
"framing-errors": 2,
},
"diff": {"framing-errors": 2},
"errors_yesterday": DEFAULT_ERRORS_YESTERDAY,
},
{
"router": "mx1.fra.de.geant.net",
"interface": "ae10",
"description": "PHY blah blah foo",
"error_counters": {
"errors_today": {
"framing-errors": 1,
},
"diff": {"framing-errors": 1},
"errors_yesterday": DEFAULT_ERRORS_YESTERDAY,
},
]
......@@ -318,6 +337,15 @@ def test_logs_message_on_missing_error_counters_for_interface(
assert "mx1.fra.de.geant.net - ae10 not found in influx data" in caplog.text
def test_does_not_log_message_for_missing_excluded_interface(
create_error_point, get_interface_errors, caplog
):
create_error_point("mx1.ams.nl.geant.net", "ae1", "today", framing_errors=0)
create_error_point("mx1.fra.de.geant.net", "ae10", "today", framing_errors=0)
get_interface_errors(exclusions=[("rt0.ams.nl.geant.net", "lag-1")])
assert not [r for r in caplog.records if r.levelname == "ERROR"]
def test_does_not_check_for_logical_interfaces(get_interface_errors, caplog):
get_interface_errors()
assert "mx1.fra.de.geant.net - ae99.1" not in caplog.text
......@@ -336,31 +364,21 @@ def test_interface_errors_doesnt_include_0_errors(
"mx1.ams.nl.geant.net", "ae1", "today", input_drops=1, framing_errors=0
)
errors = get_interface_errors()
assert errors["interfaces"][0] == {
"router": "mx1.ams.nl.geant.net",
"interface": "ae1",
"description": "PHY blah blah",
"error_counters": {
"input-drops": 1,
},
}
assert errors["interfaces"][0]["errors_today"] == {"input-drops": 1}
def test_increased_error_produces_diff(create_error_point, get_interface_errors):
def test_interface_errors_with_yesterday_errors(
create_error_point, get_interface_errors
):
create_error_point("mx1.ams.nl.geant.net", "ae1", "yesterday", input_drops=1)
create_error_point("mx1.ams.nl.geant.net", "ae1", "today", input_drops=10)
errors = get_interface_errors()
assert errors["interfaces"][0] == {
"router": "mx1.ams.nl.geant.net",
"interface": "ae1",
"description": "PHY blah blah",
"error_counters": {
"input-drops": 10,
},
"diff": {
"input-drops": 9,
},
assert errors["interfaces"][0]["diff"] == {"input-drops": 9}
assert errors["interfaces"][0]["errors_yesterday"] == {
**DEFAULT_ERRORS_YESTERDAY,
"input-drops": 1,
}
......@@ -398,32 +416,18 @@ def test_processes_excluded_interface(create_error_point, get_interface_errors):
)
create_error_point(
"mx1.fra.de.geant.net", "ae10", "today", input_drops=3, framing_errors=4
) # this interface is excluded through its description
)
errors = get_interface_errors(exclusions=[("mx1.fra.de.geant.net", "ae10")])
assert errors["interfaces"] == [
{
"router": "mx1.ams.nl.geant.net",
"interface": "ae1",
"description": "PHY blah blah",
"error_counters": {"input-drops": 1, "framing-errors": 2},
}
]
assert {(ifc["router"], ifc["interface"]) for ifc in errors["interfaces"]} == {
("mx1.ams.nl.geant.net", "ae1")
}
assert errors["excluded_interfaces"] == [
{
"router": "mx1.fra.de.geant.net",
"interface": "ae10",
"description": "PHY blah blah foo",
"error_counters": {
"input-drops": 3,
"framing-errors": 4,
"bit-error-seconds": 0,
"errored-blocks-seconds": 0,
"input-crc-errors": 0,
"input-total-errors": 0,
"input-discards": 0,
"output-drops": 0,
},
}
]
......@@ -474,10 +478,38 @@ def test_render_html(create_error_point, get_interface_errors):
create_error_point("mx1.fra.de.geant.net", "ae10", "today", input_drops=3)
errors = get_interface_errors()
result = render_html(errors=errors, date="SOME_DATE")
# The expected value contains mixed tabs and spaces. We put it in a separate file
# to comply with flake8
expected = (DATA_DIR / "test_render_html-expected.html").read_text()
assert result == expected
assert '<td class="first-column">input-drops</td>' in result
assert '<td class="first-column">framing-errors</td>' in result
def test_render_html_for_nokia(create_error_point, get_interface_errors):
create_error_point(
"rt0.ams.nl.geant.net",
"lag-1",
"yesterday",
input_total_errors=1,
input_discards=1,
output_total_errors=1,
output_discards=1,
)
create_error_point(
"rt0.ams.nl.geant.net",
"lag-1",
"today",
input_total_errors=2,
input_discards=2,
output_total_errors=2,
output_discards=2,
)
errors = get_interface_errors()
result = render_html(errors=errors, date="SOME_DATE")
assert '<td class="first-column">input-total-errors</td>' in result
assert '<td class="first-column">input-discards</td>' in result
assert '<td class="first-column">output-total-errors</td>' in result
assert '<td class="first-column">output-discards</td>' in result
def test_render_html_with_exclusions(create_error_point, get_interface_errors):
......@@ -491,11 +523,9 @@ def test_render_html_with_exclusions(create_error_point, get_interface_errors):
)
errors = get_interface_errors(exclusions=[("mx1.fra.de.geant.net", "ae10")])
result = render_html(errors=errors, date="SOME_DATE")
# The expected value contains mixed tabs and spaces. We put it in a separate file
# to comply with flake8
expected = (DATA_DIR / "test_render_html_with_exclusions-expected.html").read_text()
assert result == expected
assert "<p>EXCLUDED INTERFACES</p>" in result
assert "mx1.fra.de.geant.net - ae10 - PHY blah blah foo" in result
def test_render_email():
......@@ -615,14 +645,17 @@ def config_file(tmp_path):
"username": "some-username",
"password": "user-password",
},
"exclude-interfaces": [("mx1.fra.de.geant.net", "ae10")],
"exclude-interfaces": [
("mx1.fra.de.geant.net", "ae10"),
("rt0.ams.nl.geant.net", "lag-1"),
],
}
path = tmp_path / "config.json"
path.write_text(json.dumps(config))
return path
@patch.object(cli, "setup_logging")
@patch.object(cli, "setup_logging", return_value=MessageCountingLogHandler())
@patch.object(smtplib, "SMTP")
def test_e2e(
SMTP,
......@@ -633,7 +666,6 @@ def test_e2e(
create_error_point,
):
create_error_point("mx1.ams.nl.geant.net", "ae1", "today", input_drops=1)
with patch.object(
cli, "load_interfaces", return_value=small_inventory
), patch.object(cli, "influx_client", return_value=mock_influx_client):
......@@ -653,4 +685,17 @@ def test_e2e(
assert "mx1.ams.nl.geant.net" in report
assert "ae1" in report
assert "<td>input-drops</td>" in report
assert '<td class="first-column">input-drops</td>' in report
@patch.object(cli, "setup_logging", return_value=MessageCountingLogHandler())
@patch.object(cli, "main")
def test_e2e_with_errors(
unused_mainmain,
setup_logging,
config_file,
):
setup_logging().count += 1
runner = CliRunner()
result = runner.invoke(cli.cli, ["--config", str(config_file)])
assert result.exit_code == 1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment