From ba3d45f7a0c9d19f71862fcced17561be6217d1e Mon Sep 17 00:00:00 2001 From: Remco Tukker <remco.tukker@geant.org> Date: Thu, 21 Sep 2023 15:25:26 +0200 Subject: [PATCH] excel publisher for connected users questions --- compendium_v2/db/presentation_models.py | 2 - compendium_v2/publishers/excel_parser.py | 387 +++++++++++++++++- compendium_v2/publishers/helpers.py | 2 + .../survey_publisher_legacy_excel.py | 187 ++++++++- 4 files changed, 566 insertions(+), 12 deletions(-) diff --git a/compendium_v2/db/presentation_models.py b/compendium_v2/db/presentation_models.py index a25200e7..52c6411c 100644 --- a/compendium_v2/db/presentation_models.py +++ b/compendium_v2/db/presentation_models.py @@ -475,7 +475,6 @@ class NetworkAutomation(db.Model): class Service(db.Model): __tablename__ = 'service' - name_key: Mapped[str128_pk] name: Mapped[str128] category: Mapped[ServiceCategory] @@ -484,7 +483,6 @@ class Service(db.Model): class NRENService(db.Model): __tablename__ = 'nren_service' - nren_id: Mapped[int_pk_fkNREN] nren: Mapped[NREN] = relationship(lazy='joined') year: Mapped[int_pk] diff --git a/compendium_v2/publishers/excel_parser.py b/compendium_v2/publishers/excel_parser.py index ce1efff4..94a30647 100644 --- a/compendium_v2/publishers/excel_parser.py +++ b/compendium_v2/publishers/excel_parser.py @@ -3,7 +3,7 @@ import logging import openpyxl from compendium_v2.conversion import mapping -from compendium_v2.db.presentation_models import FeeType +from compendium_v2.db.presentation_model_enums import CarryMechanism, ConnectivityCoverage, UserCategory, FeeType from compendium_v2.environment import setup_logging from compendium_v2.resources import get_resource_file_path @@ -12,6 +12,7 @@ setup_logging() logger = logging.getLogger(__name__) EXCEL_FILE_ORGANISATION = get_resource_file_path("2021_Organisation_DataSeries.xlsx") +EXCEL_FILE_USERS = get_resource_file_path("2022_Connected_Users_DataSeries.xlsx") EXCEL_FILE_NETWORKS = get_resource_file_path("2022_Networks_DataSeries.xlsx") EXCEL_FILE_NREN_SERVICES = get_resource_file_path("NREN-Services-prefills_2023_Recovered.xlsx") @@ -440,3 +441,387 @@ def fetch_nren_services_excel_data(): 'additional_information': additional_information.strip(), 'official_description': '', } + + +def get_category(excel_cat): + if not excel_cat: + return None + if "universit" in excel_cat.lower(): + return UserCategory.universities + if "research ins" in excel_cat.lower(): + return UserCategory.institutes + if "further" in excel_cat.lower() or "fe" == excel_cat.lower(): + return UserCategory.further_education + if "inter" in excel_cat.lower(): + return UserCategory.iros + if "cultural" in excel_cat.lower() or "librar" in excel_cat.lower(): + return UserCategory.cultural + if "hospital" in excel_cat.lower(): + return UserCategory.hospitals + if "primary" in excel_cat.lower(): + return UserCategory.primary_schools + if "secondary" in excel_cat.lower(): + return UserCategory.secondary_schools + if "govern" in excel_cat.lower(): + return UserCategory.government + if "profit" in excel_cat.lower(): + return UserCategory.for_profit_orgs + logger.warning(f'unknown user category: {excel_cat}') + + +def fetch_remit_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_USERS, data_only=True, read_only=True) + sheet_name = "Connectivity Remit" + ws = wb[sheet_name] + rows = list(ws.rows) + + def get_remit(excel_remit): + if not excel_remit: + return None + if "including transit" in excel_remit.lower(): + return ConnectivityCoverage.yes_incl_other + if "national nren" in excel_remit.lower(): + return ConnectivityCoverage.yes_national_nren + if "some circ" in excel_remit.lower(): + return ConnectivityCoverage.sometimes + if "policy reas" in excel_remit.lower(): + return ConnectivityCoverage.no_policy + if "financial" in excel_remit.lower(): + return ConnectivityCoverage.no_financial + if "other reason" in excel_remit.lower(): + return ConnectivityCoverage.no_other + if "unsure" in excel_remit.lower(): + return ConnectivityCoverage.unsure + logger.warning(f'unknown remit: {excel_remit}') + + result = {} + def create_points_for_year(year, start_column): + for i in range(8, 51): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + for col in range(start_column + 2, start_column + 21, 2): + c = col + if year == 2021 and col > 30: + c += 2 + category = get_category(rows[7][c].value) + remit = get_remit(rows[i][c].value) + if category and remit: + result[(nren_name, year, category)] = remit + + create_points_for_year(2019, 72) + create_points_for_year(2020, 50) + create_points_for_year(2021, 26) + create_points_for_year(2022, 3) + return result + + +def fetch_nr_connected_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_USERS, data_only=True, read_only=True) + sheet_name = "Connected Institutions" + ws = wb[sheet_name] + rows = list(ws.rows) + + result = {} + def create_points_for_year(year, start_column): + for i in range(5, 48): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + for c in range(start_column + 1, start_column + 11): + category = get_category(rows[4][c].value) + nr_connected = int(rows[i][c].value) if rows[i][c].value else None + if category and nr_connected: + result[(nren_name, year, category)] = nr_connected + + create_points_for_year(2019, 39) + create_points_for_year(2020, 27) + create_points_for_year(2021, 14) + create_points_for_year(2022, 2) + return result + + +def fetch_market_share_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_USERS, data_only=True, read_only=True) + sheet_name = "Table Market Share" + ws = wb[sheet_name] + rows = list(ws.rows) + + result = {} + def create_points_for_year(year, start_column): + for i in range(8, 51): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + for c in range(start_column + 1, start_column + 11): + category = get_category(rows[7][c].value) + percentage_connected = float(rows[i][c].value) if rows[i][c].value else None + if category and percentage_connected: + result[(nren_name, year, category)] = percentage_connected + + create_points_for_year(2017, 64) + create_points_for_year(2018, 52) + create_points_for_year(2019, 40) + create_points_for_year(2020, 28) + create_points_for_year(2021, 16) + create_points_for_year(2022, 3) + return result + + +def fetch_users_served_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_USERS, data_only=True, read_only=True) + sheet_name = "Users" + ws = wb[sheet_name] + rows = list(ws.rows) + + result = {} + def create_points_for_year(year, start_column): + for i in range(4, 47): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + for c in range(start_column + 1, start_column + 11): + category = get_category(rows[3][c].value) + users_connected = int(rows[i][c].value) if rows[i][c].value else None + if category and users_connected: + result[(nren_name, year, category)] = users_connected + + create_points_for_year(2019, 40) + create_points_for_year(2020, 28) + create_points_for_year(2021, 14) + create_points_for_year(2022, 2) + return result + + +def fetch_typical_speed_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_USERS, data_only=True, read_only=True) + sheet_name = "Table _Typical IP Link capacity" + ws = wb[sheet_name] + rows = list(ws.rows) + + result = {} + def create_points_for_year(year, start_column): + for i in range(33, 76): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + for c in range(start_column + 1, start_column + 11): + category = get_category(rows[32][c].value) + typical_speed = int(rows[i][c].value) if rows[i][c].value else None + if category and typical_speed: + result[(nren_name, year, category)] = typical_speed + + create_points_for_year(2017, 75) + create_points_for_year(2018, 50) + create_points_for_year(2019, 38) + create_points_for_year(2020, 26) + create_points_for_year(2021, 14) + create_points_for_year(2022, 2) + return result + + +def fetch_highest_speed_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_USERS, data_only=True, read_only=True) + sheet_name = "Table _Highest IP Link capacity" + ws = wb[sheet_name] + rows = list(ws.rows) + + result = {} + def create_points_for_year(year, start_column): + for i in range(33, 76): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + for c in range(start_column + 1, start_column + 11): + category = get_category(rows[32][c].value) + highest_speed = int(rows[i][c].value) if rows[i][c].value else None + if category and highest_speed: + result[(nren_name, year, category)] = highest_speed + + create_points_for_year(2017, 64) + create_points_for_year(2018, 51) + create_points_for_year(2019, 38) + create_points_for_year(2020, 26) + create_points_for_year(2021, 14) + create_points_for_year(2022, 2) + return result + + +def fetch_highest_speed_proportion_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_USERS, data_only=True, read_only=True) + sheet_name = "Aver High cap conn Share" + ws = wb[sheet_name] + rows = list(ws.rows) + + result = {} + def create_points_for_year(year, start_column): + for i in range(5, 48): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + for c in range(start_column + 1, start_column + 11): + category = get_category(rows[4][c].value) + highest_speed = float(rows[i][c].value) if rows[i][c].value else None + if category and highest_speed: + result[(nren_name, year, category)] = highest_speed + + create_points_for_year(2020, 27) + create_points_for_year(2021, 14) + create_points_for_year(2022, 2) + return result + + +def fetch_carriers_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_USERS, data_only=True, read_only=True) + sheet_name = "Traffic carriers" + ws = wb[sheet_name] + rows = list(ws.rows) + + def get_carrier(excel_carrier): + if not excel_carrier: + return None + if "comme" in excel_carrier.lower(): + return CarryMechanism.commercial_provider_backbone + if "man" in excel_carrier.lower(): + return CarryMechanism.man + if "local loop" in excel_carrier.lower(): + return CarryMechanism.nren_local_loops + if "other" in excel_carrier.lower(): + return CarryMechanism.other + if "regional" in excel_carrier.lower(): + return CarryMechanism.regional_nren_backbone + logger.warning(f'unknown carrier: {excel_carrier}') + + result = {} + def create_points_for_year(year, start_column): + for i in range(3, 46): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + for c in range(start_column + 1, start_column + 11): + category = get_category(rows[2][c].value) + carrier = get_carrier(rows[i][c].value) + if category and carrier: + result[(nren_name, year, category)] = carrier + + create_points_for_year(2019, 40) + create_points_for_year(2020, 27) + create_points_for_year(2021, 14) + create_points_for_year(2022, 2) + return result + + +def fetch_growth_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_NETWORKS, data_only=True, read_only=True) + sheet_name = "Table Traffic Growth % " + ws = wb[sheet_name] + rows = list(ws.rows) + + result = {} + def create_points_for_year(year, start_column): + for i in range(5, 46): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + for c in range(start_column + 1, start_column + 11): + category = get_category(rows[4][c].value) + growth = float(rows[i][c].value) if rows[i][c].value else None + if category and growth: + result[(nren_name, year, category)] = growth + + create_points_for_year(2019, 40) + create_points_for_year(2020, 26) + create_points_for_year(2021, 14) + create_points_for_year(2022, 2) + return result + + +def fetch_average_traffic_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_USERS, data_only=True, read_only=True) + sheet_name = "Average Traffic" + ws = wb[sheet_name] + rows = list(ws.rows) + + result = {} + def create_points_for_year(year, start_column): + for i in range(5, 48): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + for c in range(start_column + 1, start_column + 21, 2): + category = get_category(rows[3][c].value) + from_inst = int(rows[i][c].value) if rows[i][c].value else None + to_inst = int(rows[i][c+1].value) if rows[i][c+1].value else None + if category and (from_inst or to_inst): + result[(nren_name, year, category)] = (from_inst, to_inst) + + create_points_for_year(2019, 68) + create_points_for_year(2020, 46) + create_points_for_year(2021, 24) + create_points_for_year(2022, 2) + return result + + +def fetch_peak_traffic_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_USERS, data_only=True, read_only=True) + sheet_name = "Peak traffic" + ws = wb[sheet_name] + rows = list(ws.rows) + + result = {} + def create_points_for_year(year, start_column): + for i in range(6, 49): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + for c in range(start_column + 1, start_column + 21, 2): + category = get_category(rows[4][c].value) + from_inst = int(rows[i][c].value) if rows[i][c].value else None + to_inst = int(rows[i][c+1].value) if rows[i][c+1].value else None + if category and (from_inst or to_inst): + result[(nren_name, year, category)] = (from_inst, to_inst) + + create_points_for_year(2019, 70) + create_points_for_year(2020, 47) + create_points_for_year(2021, 24) + create_points_for_year(2022, 2) + return result + + +def fetch_remote_campuses_excel_data(): + wb = openpyxl.load_workbook(EXCEL_FILE_USERS, data_only=True, read_only=True) + sheet_name = "Foreign Campuses" + ws = wb[sheet_name] + rows = list(ws.rows) + + def create_points_for_year(year, start_column): + for i in range(5, 48): + nren_name = rows[i][start_column].value + if not nren_name: + continue + nren_name = nren_name.upper() + have_remote = rows[i][start_column + 1].value + connectivity = rows[i][start_column + 2].value + country = rows[i][start_column + 3].value + connected_to_r_e = rows[i][start_column + 4].value + if have_remote and have_remote.upper() == "YES": + connectivity = connectivity.upper() == "YES" if connectivity else False + connected_to_r_e = connected_to_r_e not in [None, "-", "Not connected.", "We do not know"] + country = country or "" + yield nren_name, year, connectivity, country, connected_to_r_e + + yield from create_points_for_year(2019, 22) + yield from create_points_for_year(2020, 16) + yield from create_points_for_year(2021, 10) + yield from create_points_for_year(2022, 4) diff --git a/compendium_v2/publishers/helpers.py b/compendium_v2/publishers/helpers.py index aeb4ecd9..1668bb90 100644 --- a/compendium_v2/publishers/helpers.py +++ b/compendium_v2/publishers/helpers.py @@ -22,6 +22,8 @@ def get_uppercase_nren_dict(): # add aliases that are used in the source data: nren_dict['ASNET'] = nren_dict['ASNET-AM'] nren_dict['KIFU (NIIF)'] = nren_dict['KIFU'] + nren_dict['KIFÜ'] = nren_dict['KIFU'] + nren_dict['NIIF/HUNGARNET'] = nren_dict['KIFU'] nren_dict['SURFNET'] = nren_dict['SURF'] nren_dict['UOM/RICERKANET'] = nren_dict['UNIVERSITY OF MALTA'] nren_dict['UOM'] = nren_dict['UNIVERSITY OF MALTA'] diff --git a/compendium_v2/publishers/survey_publisher_legacy_excel.py b/compendium_v2/publishers/survey_publisher_legacy_excel.py index dfb6b6b5..059d7d93 100644 --- a/compendium_v2/publishers/survey_publisher_legacy_excel.py +++ b/compendium_v2/publishers/survey_publisher_legacy_excel.py @@ -269,18 +269,187 @@ def db_nren_services_migration(nren_dict): db.session.commit() +def db_connected_proportion_migration(nren_dict): + remit = excel_parser.fetch_remit_excel_data() + nr_connected = excel_parser.fetch_nr_connected_excel_data() + market_share = excel_parser.fetch_market_share_excel_data() + users_served = excel_parser.fetch_users_served_excel_data() + + all_entry_keys = set() + all_entry_keys.update(remit.keys()) + all_entry_keys.update(nr_connected.keys()) + all_entry_keys.update(market_share.keys()) + all_entry_keys.update(users_served.keys()) + + for key in all_entry_keys: + (abbrev, year, user_category) = key + if abbrev not in nren_dict: + logger.warning(f'{abbrev} unknown. Skipping.') + continue + + nren = nren_dict[abbrev] + connected_proportion = presentation_models.ConnectedProportion( + nren=nren, + nren_id=nren.id, + year=year, + user_category=user_category, + coverage=remit.get(key), + number_connected=nr_connected.get(key), + market_share=market_share.get(key), + users_served=users_served.get(key) + ) + db.session.merge(connected_proportion) + + db.session.commit() + + +def db_connectivity_level_migration(nren_dict): + typical_speed = excel_parser.fetch_typical_speed_excel_data() + highest_speed = excel_parser.fetch_highest_speed_excel_data() + highest_speed_proportion = excel_parser.fetch_highest_speed_proportion_excel_data() + + all_entry_keys = set() + all_entry_keys.update(typical_speed.keys()) + all_entry_keys.update(highest_speed.keys()) + all_entry_keys.update(highest_speed_proportion.keys()) + + for key in all_entry_keys: + (abbrev, year, user_category) = key + if abbrev not in nren_dict: + logger.warning(f'{abbrev} unknown. Skipping.') + continue + + nren = nren_dict[abbrev] + connected_proportion = presentation_models.ConnectivityLevel( + nren=nren, + nren_id=nren.id, + year=year, + user_category=user_category, + typical_speed=typical_speed.get(key), + highest_speed=highest_speed.get(key), + highest_speed_proportion=highest_speed_proportion.get(key) + ) + db.session.merge(connected_proportion) + + db.session.commit() + + +def db_connection_carrier_migration(nren_dict): + carriers = excel_parser.fetch_carriers_excel_data() + for key, carry_mechanism in carriers.items(): + (abbrev, year, user_category) = key + if abbrev not in nren_dict: + logger.warning(f'{abbrev} unknown. Skipping.') + continue + + nren = nren_dict[abbrev] + connection_carrier = presentation_models.ConnectionCarrier( + nren=nren, + nren_id=nren.id, + year=year, + user_category=user_category, + carry_mechanism=carry_mechanism + ) + db.session.merge(connection_carrier) + + db.session.commit() + + +def db_connectivity_growth_migration(nren_dict): + growth = excel_parser.fetch_growth_excel_data() + for key, growth_percent in growth.items(): + (abbrev, year, user_category) = key + if abbrev not in nren_dict: + logger.warning(f'{abbrev} unknown. Skipping.') + continue + + nren = nren_dict[abbrev] + connectivity_growth = presentation_models.ConnectivityGrowth( + nren=nren, + nren_id=nren.id, + year=year, + user_category=user_category, + growth=growth_percent + ) + db.session.merge(connectivity_growth) + + db.session.commit() + + +def db_connectivity_load_migration(nren_dict): + average = excel_parser.fetch_average_traffic_excel_data() + peak = excel_parser.fetch_peak_traffic_excel_data() + + all_entry_keys = set() + all_entry_keys.update(average.keys()) + all_entry_keys.update(peak.keys()) + + for key in all_entry_keys: + (abbrev, year, user_category) = key + if abbrev not in nren_dict: + logger.warning(f'{abbrev} unknown. Skipping.') + continue + + nren = nren_dict[abbrev] + connectivity_load = presentation_models.ConnectivityLoad( + nren=nren, + nren_id=nren.id, + year=year, + user_category=user_category, + average_load_from_institutions=average.get(key, (None, None))[0], + average_load_to_institutions=average.get(key, (None, None))[1], + peak_load_from_institutions=peak.get(key, (None, None))[0], + peak_load_to_institutions=peak.get(key, (None, None))[1] + ) + db.session.merge(connectivity_load) + + db.session.commit() + + +def db_remote_campuses_migration(nren_dict): + campuses = excel_parser.fetch_remote_campuses_excel_data() + for (abbrev, year, connectivity, country, connected_to_r_e) in campuses: + if abbrev not in nren_dict: + logger.warning(f'{abbrev} unknown. Skipping.') + continue + + connections = [] + if country: + connections.append({'country': country, 'local_r_and_e_connection': connected_to_r_e}) + + nren = nren_dict[abbrev] + connection_carrier = presentation_models.RemoteCampuses( + nren=nren, + nren_id=nren.id, + year=year, + remote_campus_connectivity=connectivity, + connections=connections + ) + db.session.merge(connection_carrier) + + db.session.commit() + + def _cli(app): with app.app_context(): nren_dict = helpers.get_uppercase_nren_dict() - db_budget_migration(nren_dict) - db_funding_migration(nren_dict) - db_charging_structure_migration(nren_dict) - db_staffing_migration(nren_dict) - db_ecprojects_migration(nren_dict) - db_organizations_migration(nren_dict) - db_traffic_volume_migration(nren_dict) - db_services_migration() - db_nren_services_migration(nren_dict) + # db_budget_migration(nren_dict) + # db_funding_migration(nren_dict) + # db_charging_structure_migration(nren_dict) + # db_staffing_migration(nren_dict) + # db_ecprojects_migration(nren_dict) + # db_organizations_migration(nren_dict) + # db_traffic_volume_migration(nren_dict) + # db_services_migration() + # db_nren_services_migration(nren_dict) + + # db_connected_proportion_migration(nren_dict) + # db_connectivity_level_migration(nren_dict) + + db_connection_carrier_migration(nren_dict) + db_connectivity_growth_migration(nren_dict) + db_connectivity_load_migration(nren_dict) + db_remote_campuses_migration(nren_dict) @click.command() -- GitLab