#!/usr/bin/env python3
"""Parse the ECCS uWSGI log and aggregate per-day usage statistics.

The result maps each day (``YYYY-MM-DD``) to a statistics dict shaped like
``eccs_json_format``::

    date = "2023-08-07"
    stats = {date: {"IPUnique": 0, ...}}
"""

import copy
import json
import logging
from datetime import datetime

# Template of the statistics collected for one day.  It is never filled in
# directly: every day gets its own deep copy.
#
# Open question from the original author, which applies to every "Status"
# sub-dict below: why do we need to collect the per-value counts of the
# 'status' parameter?
eccs_json_format = {
    "IPUnique": 0,         # All IPs found in the logs
    "IdPUnique": 0,        # All requests containing the "idp=" parameter (API or WEB)
    "RequestAPI": 0,       # All requests containing "api" in the URL
    "RequestWEB": 0,       # All requests not containing "api" in the URL
    "RegistrarUnique": 0,  # All requests containing the "reg_auth=" parameter (API or WEB)
    "Status": {            # All requests containing the "status=" parameter (API or WEB)
        "OK": 0,
        "ERROR": 0,
        "UNKNOWN": 0,
        "DISABLED": 0,
    },
    # Could be useful to understand who uses the API/WEB (by reg_auth).
    "CountByRegistrar": {
        "http://www.idem.garr.it/": {
            "IPUnique": 0,
            "IdPUnique": 0,
            "RequestAPI": 0,
            "RequestWEB": 0,
            "Status": {
                "OK": 0,
                "ERROR": 0,
                "UNKNOWN": 0,
                "DISABLED": 0,
            },
        },
        "http://www.srce.hr": {
            "IPUnique": 0,
            "IdPUnique": 0,
            "RequestAPI": 0,
            "RequestWEB": 0,
            "Status": {
                "OK": 0,
                "ERROR": 0,
                "UNKNOWN": 0,
                "DISABLED": 0,
            },
        },
    },
    # Could be useful to understand who uses the API/WEB (by IP).
    "CountByIPFromAPI": {
        "4.4.4.4": {
            "IdPUnique": 0,
            "RegistrarUnique": 0,
            "Status": {
                "OK": 0,
                "ERROR": 0,
                "UNKNOWN": 0,
                "DISABLED": 0,
            },
        },
        "8.8.8.8": {
            "IdPUnique": 0,
            "RegistrarUnique": 0,
            "Status": {
                "OK": 0,
                "ERROR": 0,
                "UNKNOWN": 0,
                "DISABLED": 0,
            },
        },
    },
    # Could be useful to understand when federations run automatic scripts.
    "countByHour": {
        "17": {
            "IPUnique": 0,
            "IdPUnique": 0,
            "RequestAPI": 0,
            "RequestWEB": 0,
            "Status": {
                "OK": 0,
                "ERROR": 0,
                "UNKNOWN": 0,
                "DISABLED": 0,
            },
        },
        "23": {
            "IPUnique": 0,
            "IdPUnique": 0,
            "RequestAPI": 0,
            "RequestWEB": 0,
            "Status": {
                "OK": 0,
                "ERROR": 0,
                "UNKNOWN": 0,
                "DISABLED": 0,
            },
        },
    },
}

# Timestamp layout of the second log field, e.g. "[Mon Aug 07 17:05:01 2023]".
_LOG_DATE_FORMAT = '[%a %b %d %H:%M:%S %Y]'

# Names given, in order, to the fields of one log line.
_ENTRY_FIELDS = ("ip", "date", "http-method", "url")

# Counters of the per-day dict that count distinct values rather than requests.
_UNIQUE_COUNTERS = ("IPUnique", "IdPUnique", "RegistrarUnique")


def get_url_info(url, parameter):
    """Return the raw value of query parameter *parameter* found in *url*.

    Example: for
    ``/eccs/api/eccsresults?idp=https://garr-idp-prod.irccs.garr.it/idp/shibboleth``
    and ``parameter='idp'`` the result is
    ``https://garr-idp-prod.irccs.garr.it/idp/shibboleth``.

    Args:
        url: Request URL (path plus optional query string).
        parameter: Exact parameter name, e.g. ``'idp'``, ``'reg_auth'`` or
            ``'status'``.

    Returns:
        The text after the first ``=`` of the first ``name=value`` pair whose
        name equals *parameter* (no percent-decoding is applied), or ``None``
        when the URL has no query string or no such parameter.
    """
    # Everything after the first '?' is the query string ('' when there is
    # none), so URLs without parameters simply yield no match.
    query = url.partition('?')[2]
    for pair in query.split('&'):
        # Split on the first '=' only, so values containing '=' stay whole.
        # Names are compared exactly: a substring test would let 'idp' match
        # a pair such as 'reg_auth=http://idp.example.org/'.
        name, separator, value = pair.partition('=')
        if separator and name == parameter:
            return value
    return None


def _parse_log_lines(lines):
    """Turn raw log lines into entry dicts keyed by ``_ENTRY_FIELDS``."""
    entries = []
    for line in lines:
        if "|" not in line:
            continue
        # NOTE(review): the statements that build `details` sit between two
        # hunks of the reviewed patch and were not visible.  Splitting on "|"
        # and stripping each field is reconstructed from the surrounding
        # code -- confirm against the real file.
        details = [field.strip() for field in line.split("|")]
        # Keep only the day: statistics are aggregated per date.
        details[1] = datetime.strptime(details[1], _LOG_DATE_FORMAT).strftime('%Y-%m-%d')
        entries.append(dict(zip(_ENTRY_FIELDS, details)))
    return entries


def _count_unique(day_stats, day_seen, counter, value):
    """Increment ``day_stats[counter]`` the first time *value* is seen that day."""
    if value not in day_seen[counter]:
        day_seen[counter].add(value)
        day_stats[counter] += 1


def _collect_stats(entries):
    """Aggregate parsed log entries into one statistics dict per date."""
    stats = {}
    seen = {}  # date -> {counter name -> set of values already counted}
    for entry in entries:
        date = entry['date']
        if date not in stats:
            # Each day needs its OWN copy: binding the template itself would
            # make every date share (and mutate) the same counters.
            # NOTE(review): the template's placeholder registrars, IPs and
            # hours are copied along, as the previous code emitted them too --
            # confirm whether they should be stripped.
            stats[date] = copy.deepcopy(eccs_json_format)
            seen[date] = {counter: set() for counter in _UNIQUE_COUNTERS}
        day_stats = stats[date]
        day_seen = seen[date]
        url = entry['url']

        # IPUnique - distinct client IPs seen on this day.
        _count_unique(day_stats, day_seen, 'IPUnique', entry['ip'])

        # IdPUnique - distinct values of the "idp" parameter on this day.
        idp = get_url_info(url, 'idp')
        if idp is not None:
            _count_unique(day_stats, day_seen, 'IdPUnique', idp)

        # RequestAPI / RequestWEB - every request is exactly one of the two.
        if '/eccs/api/' in url:
            day_stats['RequestAPI'] += 1
        else:
            day_stats['RequestWEB'] += 1

        # RegistrarUnique - distinct values of the "reg_auth" parameter.
        reg_auth = get_url_info(url, 'reg_auth')
        if reg_auth is not None:
            _count_unique(day_stats, day_seen, 'RegistrarUnique', reg_auth)

        # Status - only the values present in the template are counted
        # (OK, ERROR, UNKNOWN, DISABLED); anything else is ignored.
        status = get_url_info(url, 'status')
        if status in day_stats['Status']:
            day_stats['Status'][status] += 1
    return stats


def main():
    """Read the ECCS uWSGI log, compute the per-day statistics and print them."""
    logging.basicConfig(filename='/home/eccs/logs/eccs-log-parsing.log', level=logging.INFO)
    logging.info('Started')

    file_name = "/home/eccs/logs/eccs-uwsgi.log"
    # The context manager closes the log even if parsing raises.
    with open(file_name, "r") as log_file:
        entries = _parse_log_lines(log_file)

    eccs_stats = _collect_stats(entries)
    print(eccs_stats)

    logging.info('Finished')