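"""Consolidate raw What-If Scenario capacity reports.

Reads the raw per-scenario capacity report CSVs for a chosen timeframe
(daily, weekly, monthly, quarterly or yearly), keeps the highest observed
usage per link and failure scenario, and writes the consolidated result
both as a raw CSV and as a human-readable text table.
"""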
import os
import re
import argparse
import sys
import pandas as pd        # https://pandas.pydata.org
from datetime import datetime, timezone, timedelta
from pathlib import Path
import numpy as np

###############################################################################
# INPUT DATA SECTION
###############################################################################

# Make sure these settings match the What-If Scenario runner script
RAW_REPORT_DIRECTORY="/Users/daniel.verlouw/Desktop/rawreports"
RAW_REPORT_FILE_PREFIX="raw_capacityreport_"
RAW_REPORT_FILE_SUFFIX="csv"

CONSOLIDATED_REPORT_DIRECTORY="/Users/daniel.verlouw/Desktop/consolidatedreports"
CONSOLIDATED_REPORT_FILE_PREFIX="consolidated_capacityreport_"
CONSOLIDATED_REPORT_FILE_SUFFIX_RAW="csv"
CONSOLIDATED_REPORT_FILE_SUFFIX_HUMAN="txt"

# Scenarios that trigger a 'MUST UPGRADE' flag when the threshold is exceeded
MUST_UPGRADE_SCENARIOS=["normal","onelinkfail","onenodefail"]
# Scenarios that trigger a 'SHOULD UPGRADE' flag when the threshold is exceeded
SHOULD_UPGRADE_SCENARIOS=["twolinkfail","nodelinkfail"]

# ISO 8601 basic timestamp format (YYYYMMDDTHHMMZ)
ISO8601_FORMAT = '%Y%m%dT%H%MZ'
ISO8601_REGEXP = r'\d{4}\d{2}\d{2}T\d{2}\d{2}Z'
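
# Illustrative example: a raw report written at 2024-03-05 14:30 UTC would be named
#   raw_capacityreport_20240305T1430Z.csv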

###############################################################################
# RAW CONSOLIDATED REPORT
###############################################################################

# --- Helper function to get the row of the max usage for a given column, handling empty/missing columns ---
def get_max_usage_row(group, usage_col):
    """
    Returns a single row (as a Series) within `group` that has the maximum value in `usage_col`.
    If `usage_col` does not exist or is entirely NaN, returns None.
    """
    # If the column doesn't exist or has all null values, return None
    if usage_col not in group.columns or group[usage_col].dropna().empty:
        return None
    max_val = group[usage_col].max()
    # Filter to rows where usage_col equals the maximum value.
    max_rows = group[group[usage_col] == max_val]
    # Return only the first row among those with the maximum value.
    return max_rows.iloc[0]
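
# Illustrative usage (not executed): for a group whose 'NormalUsageGbps' column holds
# [80.0, 95.5, NaN], get_max_usage_row(group, 'NormalUsageGbps') returns the row with
# 95.5; if the column were missing or all-NaN, it would return None.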

def extract_usage_details(group):
    """
    For a single group of rows (all links with the same ID), find the row with the max usage for each usage field (Gbps)
    and extract the relevant columns.
    Booleans are set to True if at least one row in the group is True.
    """
    # We'll create a dict to hold the final data for this ID.
    out = {}

    # Basic info columns (these are assumed to be consistent within the group)
    first_row = group.iloc[0]
    out['ID']                = first_row.get('ID', None)
    out['NodeA']             = first_row.get('NodeA', None)
    out['NodeB']             = first_row.get('NodeB', None)
    out['CapacityGbps']      = first_row.get('CapacityGbps', None)
    out['ConfNormalThrPct']  = first_row.get('ConfNormalThrPct', None)
    out['ConfNormalThrGbps'] = first_row.get('ConfNormalThrGbps', None)
    out['ConfFailThrPct']    = first_row.get('ConfFailThrPct', None)
    out['ConfFailThrGbps']   = first_row.get('ConfFailThrGbps', None)

    # Normal usage
    normal_row = get_max_usage_row(group, 'NormalUsageGbps')
    out['NormalEnabled']     = bool(group['NormalEnabled'].any()) if 'NormalEnabled' in group.columns else False
    out['NormalUsagePct']    = normal_row.get('NormalUsagePct', np.nan) if normal_row is not None else np.nan
    out['NormalUsageGbps']   = normal_row.get('NormalUsageGbps', np.nan) if normal_row is not None else np.nan
    out['NormalUsageExceed'] = bool(group['NormalUsageExceed'].any()) if 'NormalUsageExceed' in group.columns else False
    out['NormalDateTime']    = normal_row.get('NormalDateTime', pd.NaT) if normal_row is not None else pd.NaT

    # 1 Link Fail usage
    one_link_row = get_max_usage_row(group, '1LinkFailUsageGbps')
    out['1LinkFailEnabled']     = bool(group['1LinkFailEnabled'].any()) if '1LinkFailEnabled' in group.columns else False
    out['1LinkFailScenario']    = one_link_row.get('1LinkFailScenario', None) if one_link_row is not None else None
    out['1LinkFailUsagePct']    = one_link_row.get('1LinkFailUsagePct', np.nan) if one_link_row is not None else np.nan
    out['1LinkFailUsageGbps']   = one_link_row.get('1LinkFailUsageGbps', np.nan) if one_link_row is not None else np.nan
    out['1LinkFailUsageExceed'] = bool(group['1LinkFailUsageExceed'].any()) if '1LinkFailUsageExceed' in group.columns else False
    out['1LinkFailUsageTime']   = one_link_row.get('1LinkFailUsageTime', pd.NaT) if one_link_row is not None else pd.NaT

    # 2 Link Fail usage
    two_link_row = get_max_usage_row(group, '2LinkFailUsageGbps')
    out['2LinkFailEnabled']     = bool(group['2LinkFailEnabled'].any()) if '2LinkFailEnabled' in group.columns else False
    out['2LinkFailScenario']    = two_link_row.get('2LinkFailScenario', None) if two_link_row is not None else None
    out['2LinkFailUsagePct']    = two_link_row.get('2LinkFailUsagePct', np.nan) if two_link_row is not None else np.nan
    out['2LinkFailUsageGbps']   = two_link_row.get('2LinkFailUsageGbps', np.nan) if two_link_row is not None else np.nan
    out['2LinkFailUsageExceed'] = bool(group['2LinkFailUsageExceed'].any()) if '2LinkFailUsageExceed' in group.columns else False
    out['2LinkFailUsageTime']   = two_link_row.get('2LinkFailUsageTime', pd.NaT) if two_link_row is not None else pd.NaT

    # 1 Node Fail usage
    one_node_row = get_max_usage_row(group, '1NodeFailUsageGbps')
    out['1NodeFailEnabled']     = bool(group['1NodeFailEnabled'].any()) if '1NodeFailEnabled' in group.columns else False
    out['1NodeFailScenario']    = one_node_row.get('1NodeFailScenario', None) if one_node_row is not None else None
    out['1NodeFailUsagePct']    = one_node_row.get('1NodeFailUsagePct', np.nan) if one_node_row is not None else np.nan
    out['1NodeFailUsageGbps']   = one_node_row.get('1NodeFailUsageGbps', np.nan) if one_node_row is not None else np.nan
    out['1NodeFailUsageExceed'] = bool(group['1NodeFailUsageExceed'].any()) if '1NodeFailUsageExceed' in group.columns else False
    out['1NodeFailUsageTime']   = one_node_row.get('1NodeFailUsageTime', pd.NaT) if one_node_row is not None else pd.NaT

    # Node+1 Link Fail usage
    node_plus_link_row = get_max_usage_row(group, 'Node+1LinkFailUsageGbps')
    out['Node+1LinkFailEnabled']     = bool(group['Node+1LinkFailEnabled'].any()) if 'Node+1LinkFailEnabled' in group.columns else False
    out['Node+1LinkFailScenario']    = node_plus_link_row.get('Node+1LinkFailScenario', None) if node_plus_link_row is not None else None
    out['Node+1LinkFailUsagePct']    = node_plus_link_row.get('Node+1LinkFailUsagePct', np.nan) if node_plus_link_row is not None else np.nan
    out['Node+1LinkFailUsageGbps']   = node_plus_link_row.get('Node+1LinkFailUsageGbps', np.nan) if node_plus_link_row is not None else np.nan
    out['Node+1LinkFailUsageExceed'] = bool(group['Node+1LinkFailUsageExceed'].any()) if 'Node+1LinkFailUsageExceed' in group.columns else False
    out['Node+1LinkFailUsageTime']   = node_plus_link_row.get('Node+1LinkFailUsageTime', pd.NaT) if node_plus_link_row is not None else pd.NaT

    # Finally, consolidated upgrade flags
    out['MustUpgrade']   = bool(group['MustUpgrade'].any()) if 'MustUpgrade' in group.columns else False
    out['ShouldUpgrade'] = bool(group['ShouldUpgrade'].any()) if 'ShouldUpgrade' in group.columns else False

    return pd.Series(out)
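
# Illustrative alternative (sketch, not used below): the same per-link consolidation
# could be expressed as concat_df.groupby('ID').apply(extract_usage_details); main()
# instead loops over the groups so it can overwrite the ID with the group key explicitly.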

###############################################################################
# HUMAN READABLE CONSOLIDATED REPORT
###############################################################################

def build_human_report(df_raw):
    df_human= df_raw.copy()

    # Helper formatting functions
    def mark_upgrade(scenario, exceed):
        if scenario in MUST_UPGRADE_SCENARIOS and exceed:
            return " (**)"
        elif scenario in SHOULD_UPGRADE_SCENARIOS and exceed:
            return " (*)"
        else:
            return ""

    def normal_thr(row):
        return f"{row['ConfNormalThrPct']*100:.0f}% / {row['ConfNormalThrGbps']:.0f}G" 

    def fail_thr(row):
        return f"{row['ConfFailThrPct']*100:.0f}% / {row['ConfFailThrGbps']:.0f}G" 

    def normal_usage(row):
        return f"{row['NormalUsagePct']*100:.0f}% / {row['NormalUsageGbps']:.1f}G" + mark_upgrade("normal", row['NormalUsageExceed'])

    def onelinkfail_usage(row):
        return f"{row['1LinkFailUsagePct']*100:.0f}% / {row['1LinkFailUsageGbps']:.1f}G" + mark_upgrade("onelinkfail", row['1LinkFailUsageExceed'])

    def twolinkfail_usage(row):
        return f"{row['2LinkFailUsagePct']*100:.0f}% / {row['2LinkFailUsageGbps']:.1f}G" + mark_upgrade("twolinkfail", row['2LinkFailUsageExceed'])

    def onenodefail_usage(row):
        return f"{row['1NodeFailUsagePct']*100:.0f}% / {row['1NodeFailUsageGbps']:.1f}G" + mark_upgrade("onenodefail", row['1NodeFailUsageExceed'])

    def nodelinkfail_usage(row):
        return f"{row['Node+1LinkFailUsagePct']*100:.0f}% / {row['Node+1LinkFailUsageGbps']:.1f}G" + mark_upgrade("nodelinkfail", row['Node+1LinkFailUsageExceed'])

    def upgrade(row):
        return "MUST (**)" if row['MustUpgrade'] else ("SHOULD (*)" if row['ShouldUpgrade'] else "NO")

    df_human['ConfNormalThr']       = df_human.apply(normal_thr, axis=1)
    df_human['ConfFailThr']         = df_human.apply(fail_thr, axis=1)
    df_human['NormalUsage']         = df_human.apply(normal_usage, axis=1)
    df_human['1LinkFailUsage']      = df_human.apply(onelinkfail_usage, axis=1)
    df_human['2LinkFailUsage']      = df_human.apply(twolinkfail_usage, axis=1)
    df_human['1NodeFailUsage']      = df_human.apply(onenodefail_usage, axis=1)
    df_human['Node+1LinkFailUsage'] = df_human.apply(nodelinkfail_usage, axis=1)
    df_human['Upgrade']             = df_human.apply(upgrade, axis=1)

    # Drop unused columns
    df_human.drop(['ConfNormalThrPct','ConfNormalThrGbps'],axis=1, inplace=True)
    df_human.drop(['ConfFailThrPct','ConfFailThrGbps'],axis=1, inplace=True)
    df_human.drop(['NormalUsagePct','NormalUsageGbps','NormalUsageExceed'],axis=1, inplace=True)
    df_human.drop(['1LinkFailUsagePct','1LinkFailUsageGbps','1LinkFailUsageExceed','1LinkFailUsageTime'],axis=1, inplace=True)
    df_human.drop(['2LinkFailUsagePct','2LinkFailUsageGbps','2LinkFailUsageExceed','2LinkFailUsageTime'],axis=1, inplace=True)
    df_human.drop(['1NodeFailUsagePct','1NodeFailUsageGbps','1NodeFailUsageExceed','1NodeFailUsageTime'],axis=1, inplace=True)
    df_human.drop(['Node+1LinkFailUsagePct','Node+1LinkFailUsageGbps','Node+1LinkFailUsageExceed','Node+1LinkFailUsageTime'],axis=1, inplace=True)
    df_human.drop(['MustUpgrade','ShouldUpgrade'],axis=1, inplace=True)

    # Replace NaN/NaT values with "N/A"
    # (cast the datetime column to object first so the string can replace NaT values)
    df_human['NormalDateTime'] = df_human['NormalDateTime'].astype(object)
    df_human.fillna("N/A", inplace=True)

    return df_human[['NodeA', 'NodeB', 'CapacityGbps', 'ConfNormalThr', 'ConfFailThr', 
                    'NormalUsage', 'NormalDateTime', 
                    '1LinkFailScenario', '1LinkFailUsage',
                    '2LinkFailScenario', '2LinkFailUsage',
                    '1NodeFailScenario', '1NodeFailUsage',
                    'Node+1LinkFailScenario', 'Node+1LinkFailUsage',
                    'Upgrade']]
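
# Illustrative output (assumed example values): with NormalUsagePct=0.80,
# NormalUsageGbps=320.0 and NormalUsageExceed=True the 'NormalUsage' cell reads
# "80% / 320.0G (**)"; a row with MustUpgrade=True shows "MUST (**)" in 'Upgrade'.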

###############################################################################
# FILE FUNCTIONS
###############################################################################

def find_files_by_timeframe(directory, prefix, suffix, start_datetime, end_datetime):
    # List all raw reports in directory
    all_raw_reports = [
        file for file in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, file)) 
            and file.startswith(prefix) 
            and file.endswith(suffix)
            and re.search(ISO8601_REGEXP, file)
    ]

    # Filter to files that match the timestamp pattern within the specified datetime range
    matching_files = []
    for file in all_raw_reports:
        match = re.search(ISO8601_REGEXP, file)
        file_date = datetime.strptime(match.group(), ISO8601_FORMAT).replace(tzinfo=timezone.utc)

        if start_datetime <= file_date <= end_datetime:
            matching_files.append(os.path.join(directory, file))

    return matching_files
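
# Illustrative call (assumed dates): selects all raw reports stamped between
# 1 March and 1 April 2024 (inclusive):
#   find_files_by_timeframe(RAW_REPORT_DIRECTORY, RAW_REPORT_FILE_PREFIX, RAW_REPORT_FILE_SUFFIX,
#                           datetime(2024, 3, 1, tzinfo=timezone.utc),
#                           datetime(2024, 4, 1, tzinfo=timezone.utc))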


def store_consolidated(df_consolidated, directory, prefix, suffix):
    path = Path(directory)
    path.mkdir(parents=True, exist_ok=True)  # Create directory if it doesn't exist

    # Create an ISO 8601 basic-format, UTC-timestamped filename
    timestamp = datetime.now(timezone.utc).strftime(ISO8601_FORMAT)
    filename = f'{prefix}{timestamp}.{suffix}'

    if suffix == "csv":
        df_consolidated.to_csv(os.path.join(path, filename), sep=',', encoding='utf-8', date_format=ISO8601_FORMAT, header=True)

    elif suffix == "txt":
        markdown = df_consolidated.to_markdown(headers='keys', tablefmt='psql')
        # Write the markdown string to a file
        with open(os.path.join(path, filename), "w") as file:
            file.write(markdown)
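
# Illustrative filenames produced by store_consolidated (example timestamp 20240305T1430Z):
#   consolidated_capacityreport_20240305T1430Z.csv  (suffix "csv": comma-separated values)
#   consolidated_capacityreport_20240305T1430Z.txt  (suffix "txt": psql-style text table)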


###############################################################################
# MAIN
###############################################################################

def main():
    # Parse commandline arguments
    parser = argparse.ArgumentParser(description='Consolidate raw capacity reports for a given timeframe into a single report.')
    parser.add_argument('--daily', action='store_true', help='Create daily report (past day)')
    parser.add_argument('--weekly', action='store_true', help='Create weekly report (past week)')
    parser.add_argument('--monthly', action='store_true', help='Create monthly report (past month)')
    parser.add_argument('--quarterly', action='store_true', help='Create quarterly report (past quarter)')
    parser.add_argument('--yearly', action='store_true', help='Create yearly report (past year)')

    if len(sys.argv) == 1:
        # No arguments were provided; print help message.
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()
    today = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
    first_day_of_current_month = today.replace(day=1)

    if args.daily:
        start_datetime= today - timedelta(days=1)
        end_datetime  = today
    elif args.weekly:
        start_datetime= today - timedelta(weeks=1)
        end_datetime  = today
    elif args.monthly:
        # First day of last month
        start_datetime= (first_day_of_current_month - timedelta(days=1)).replace(day=1)
        # First day of current month
        end_datetime  = first_day_of_current_month
    elif args.quarterly:
        # Previous full calendar quarter: first day of the previous quarter
        # through the first day of the current quarter
        current_quarter_start = today.replace(month=((today.month - 1) // 3) * 3 + 1, day=1)
        prev_quarter_end = current_quarter_start - timedelta(days=1)
        start_datetime= prev_quarter_end.replace(month=((prev_quarter_end.month - 1) // 3) * 3 + 1, day=1)
        end_datetime  = current_quarter_start
    elif args.yearly:
        # Previous full calendar year: 1 January of last year through 1 January of this year
        start_datetime= today.replace(year=today.year - 1, month=1, day=1)
        end_datetime  = today.replace(month=1, day=1)

    matching_files= find_files_by_timeframe(RAW_REPORT_DIRECTORY, RAW_REPORT_FILE_PREFIX, RAW_REPORT_FILE_SUFFIX, start_datetime, end_datetime)

    if len(matching_files) > 0:
        print(f"Generating consolidated report for {len(matching_files)} raw reports for timeframe {start_datetime} through {end_datetime}")

        # List of columns that should be parsed as dates from CSV
        date_columns = ['NormalDateTime', '1LinkFailUsageTime', '2LinkFailUsageTime', '1NodeFailUsageTime', 'Node+1LinkFailUsageTime']

        # Read and concat CSVs
        dataframes= [pd.read_csv(file, sep=',', encoding='utf-8', parse_dates=date_columns, date_format=ISO8601_FORMAT, float_precision='round_trip') for file in matching_files]
        concat_df = pd.concat(dataframes)
    
        # Walk over the results for each link and extract and store the highest usage for each scenario
        results = []
        for id_val, group in concat_df.groupby('ID'):
            details = extract_usage_details(group)
            # Make sure the ID column matches the group key
            details['ID'] = id_val
            results.append(details)
    
        consolidated_raw = pd.DataFrame(results)
        consolidated_raw.set_index("ID", inplace=True)
        store_consolidated(consolidated_raw, CONSOLIDATED_REPORT_DIRECTORY, CONSOLIDATED_REPORT_FILE_PREFIX, CONSOLIDATED_REPORT_FILE_SUFFIX_RAW)

        consolidated_human= build_human_report(consolidated_raw)
        store_consolidated(consolidated_human, CONSOLIDATED_REPORT_DIRECTORY, CONSOLIDATED_REPORT_FILE_PREFIX, CONSOLIDATED_REPORT_FILE_SUFFIX_HUMAN)

    else:
        print(f"No raw files found for timeframe {start_datetime} through {end_datetime}")

if __name__=="__main__":
    main()