From fef8db90b841646110ef035b63c82e2b30d8320d Mon Sep 17 00:00:00 2001 From: Mohammad Torkashvand <mohammad.torkashvand@geant.org> Date: Thu, 14 Sep 2023 13:10:48 +0200 Subject: [PATCH] refactor publisher and make excel files consistent --- compendium_v2/background_task/__init__.py | 0 compendium_v2/conversion/conversion.py | 6 +- ...rvey_publisher_2022.py => db_publisher.py} | 0 .../excel_parser.py} | 37 +++---- ...vey_publisher_v1.py => excel_publisher.py} | 21 ++-- .../2021_Organisation_DataSeries.xlsx | Bin .../2022_Connected_Users_DataSeries.xlsx | Bin .../2022_Networks_DataSeries.xlsx | Bin ...NREN-Services-prefills_2023_Recovered.xlsx | Bin test/test_excel_publisher.py | 94 ++++++++++++++++++ 10 files changed, 128 insertions(+), 30 deletions(-) delete mode 100644 compendium_v2/background_task/__init__.py rename compendium_v2/publishers/{survey_publisher_2022.py => db_publisher.py} (100%) rename compendium_v2/{background_task/parse_excel_data.py => publishers/excel_parser.py} (91%) rename compendium_v2/publishers/{survey_publisher_v1.py => excel_publisher.py} (92%) rename compendium_v2/{background_task/xlsx => resources}/2021_Organisation_DataSeries.xlsx (100%) rename compendium_v2/{background_task/xlsx => resources}/2022_Connected_Users_DataSeries.xlsx (100%) rename compendium_v2/{background_task/xlsx => resources}/2022_Networks_DataSeries.xlsx (100%) rename compendium_v2/{conversion => resources}/NREN-Services-prefills_2023_Recovered.xlsx (100%) create mode 100644 test/test_excel_publisher.py diff --git a/compendium_v2/background_task/__init__.py b/compendium_v2/background_task/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/compendium_v2/conversion/conversion.py b/compendium_v2/conversion/conversion.py index a963b446..615165b0 100644 --- a/compendium_v2/conversion/conversion.py +++ b/compendium_v2/conversion/conversion.py @@ -30,7 +30,9 @@ setup_logging() logger = logging.getLogger('conversion') -EXCEL_FILE = 
os.path.join(os.path.dirname(__file__), "NREN-Services-prefills_2023_Recovered.xlsx") +resources_dir = f"{os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))}/resources" + +EXCEL_NREN_SERVICES_2023 = os.path.join(resources_dir, "NREN-Services-prefills_2023_Recovered.xlsx") def query_nren(nren_id: int): @@ -112,7 +114,7 @@ def convert_answers(answers): def load_service_data(): - wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True) + wb = openpyxl.load_workbook(EXCEL_NREN_SERVICES_2023, data_only=True, read_only=True) ws = wb["Sheet1"] rows = list(ws.rows) diff --git a/compendium_v2/publishers/survey_publisher_2022.py b/compendium_v2/publishers/db_publisher.py similarity index 100% rename from compendium_v2/publishers/survey_publisher_2022.py rename to compendium_v2/publishers/db_publisher.py diff --git a/compendium_v2/background_task/parse_excel_data.py b/compendium_v2/publishers/excel_parser.py similarity index 91% rename from compendium_v2/background_task/parse_excel_data.py rename to compendium_v2/publishers/excel_parser.py index aba3ef6f..c1139266 100644 --- a/compendium_v2/background_task/parse_excel_data.py +++ b/compendium_v2/publishers/excel_parser.py @@ -9,14 +9,17 @@ setup_logging() logger = logging.getLogger(__name__) -EXCEL_FILE = os.path.join(os.path.dirname(__file__), "xlsx", "2021_Organisation_DataSeries.xlsx") -NETWORK_EXCEL_FILE = os.path.join(os.path.dirname(__file__), "xlsx", "2022_Networks_DataSeries.xlsx") +resources_dir = f"{os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))}/resources" +EXCEL_ORGANISATION_2021 = os.path.join(resources_dir, "2021_Organisation_DataSeries.xlsx") +EXCEL_CONNECTED_USERS_2022 = os.path.join(resources_dir, "2022_Connected_Users_DataSeries.xlsx") +EXCEL_NETWORKS_2022 = os.path.join(resources_dir, "2022_Networks_DataSeries.xlsx") -def fetch_budget_excel_data(): + +def fetch_budget_data(): # load the xlsx file sheet_name = "1. 
Budget" - wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True) + wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True) # select the active worksheet ws = wb[sheet_name] @@ -34,9 +37,9 @@ def fetch_budget_excel_data(): yield nren.upper(), budget, year -def fetch_funding_excel_data(): +def fetch_funding_data(): # load the xlsx file - wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True) + wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True) # select the active worksheet sheet_name = "2. Income Sources" @@ -117,9 +120,9 @@ def fetch_funding_excel_data(): yield from create_points_for_year_from_2018(ws2, 8, 51, 2021, 11, 12) -def fetch_charging_structure_excel_data(): +def fetch_charging_structure_data(): # load the xlsx file - wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True) + wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True) # select the active worksheet sheet_name = "3. Charging mechanism" @@ -184,9 +187,9 @@ def fetch_charging_structure_excel_data(): yield from create_points_for_2019(3, 46, 2019, 6) -def fetch_staffing_excel_data(): +def fetch_staffing_data(): # load the xlsx file - wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True) + wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True) # select the active worksheet sheet_name = "4. Staff" @@ -232,9 +235,9 @@ def fetch_staffing_excel_data(): yield from create_points_for_year(2021, 2, 5) -def fetch_staff_function_excel_data(): +def fetch_staff_function_data(): # load the xlsx file - wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True) + wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True) # select the active worksheet sheet_name = "5. 
Staff by Function" @@ -300,9 +303,9 @@ def fetch_staff_function_excel_data(): yield from create_points_for_year(2021, 3, 5) -def fetch_ecproject_excel_data(): +def fetch_ecproject_data(): # load the xlsx file - wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True) + wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True) # select the active worksheet sheet_name = "7. EC Projects" @@ -332,9 +335,9 @@ def fetch_ecproject_excel_data(): yield from create_points_for_year(2021, 1, 173) -def fetch_organization_excel_data(): +def fetch_organization_data(): # load the xlsx file - wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True) + wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True) # select the active worksheet sheet_name = "Organization" @@ -352,7 +355,7 @@ def fetch_organization_excel_data(): def fetch_traffic_excel_data(): # load the xlsx file - wb = openpyxl.load_workbook(NETWORK_EXCEL_FILE, data_only=True, read_only=True) + wb = openpyxl.load_workbook(EXCEL_NETWORKS_2022, data_only=True, read_only=True) # select the active worksheet sheet_name = "Estimated_Traffic TByte" diff --git a/compendium_v2/publishers/survey_publisher_v1.py b/compendium_v2/publishers/excel_publisher.py similarity index 92% rename from compendium_v2/publishers/survey_publisher_v1.py rename to compendium_v2/publishers/excel_publisher.py index a6e2376f..b7843c28 100644 --- a/compendium_v2/publishers/survey_publisher_v1.py +++ b/compendium_v2/publishers/excel_publisher.py @@ -1,5 +1,5 @@ """ -survey_publisher_v1 +excel_publisher ========================= This module loads the survey data from before 2022 from an excel file. 
@@ -16,15 +16,14 @@ from sqlalchemy import select import compendium_v2 from compendium_v2.environment import setup_logging -from compendium_v2.background_task import parse_excel_data from compendium_v2.config import load from compendium_v2.db import db, model from compendium_v2.survey_db import model as survey_model -from compendium_v2.publishers import helpers +from compendium_v2.publishers import helpers, excel_parser setup_logging() -logger = logging.getLogger('survey-publisher-v1') +logger = logging.getLogger(__name__) def db_budget_migration(nren_dict): @@ -55,7 +54,7 @@ def db_budget_migration(nren_dict): db.session.merge(budget_entry) # Import the data from excel sheet to database - exceldata = parse_excel_data.fetch_budget_excel_data() + exceldata = excel_parser.fetch_budget_data() for abbrev, budget, year in exceldata: if abbrev not in nren_dict: @@ -77,7 +76,7 @@ def db_budget_migration(nren_dict): def db_funding_migration(nren_dict): # Import the data to database - data = parse_excel_data.fetch_funding_excel_data() + data = excel_parser.fetch_funding_data() for (abbrev, year, client_institution, european_funding, @@ -108,7 +107,7 @@ def db_funding_migration(nren_dict): def db_charging_structure_migration(nren_dict): # Import the data to database - data = parse_excel_data.fetch_charging_structure_excel_data() + data = excel_parser.fetch_charging_structure_data() for (abbrev, year, charging_structure) in data: if abbrev not in nren_dict: @@ -126,7 +125,7 @@ def db_charging_structure_migration(nren_dict): def db_staffing_migration(nren_dict): - staff_data = parse_excel_data.fetch_staffing_excel_data() + staff_data = excel_parser.fetch_staffing_data() nren_staff_map = {} for (abbrev, year, permanent_fte, subcontracted_fte) in staff_data: @@ -145,7 +144,7 @@ def db_staffing_migration(nren_dict): non_technical_fte=0 ) - function_data = parse_excel_data.fetch_staff_function_excel_data() + function_data = excel_parser.fetch_staff_function_data() for (abbrev, 
year, technical_fte, non_technical_fte) in function_data: if abbrev not in nren_dict: logger.warning(f'{abbrev} unknown. Skipping staff function data.') @@ -179,7 +178,7 @@ def db_staffing_migration(nren_dict): def db_ecprojects_migration(nren_dict): - ecproject_data = parse_excel_data.fetch_ecproject_excel_data() + ecproject_data = excel_parser.fetch_ecproject_data() for (abbrev, year, project) in ecproject_data: if abbrev not in nren_dict: logger.warning(f'{abbrev} unknown. Skipping.') @@ -192,7 +191,7 @@ def db_ecprojects_migration(nren_dict): def db_organizations_migration(nren_dict): - organization_data = parse_excel_data.fetch_organization_excel_data() + organization_data = excel_parser.fetch_organization_data() for (abbrev, year, org) in organization_data: if abbrev not in nren_dict: logger.warning(f'{abbrev} unknown. Skipping.') diff --git a/compendium_v2/background_task/xlsx/2021_Organisation_DataSeries.xlsx b/compendium_v2/resources/2021_Organisation_DataSeries.xlsx similarity index 100% rename from compendium_v2/background_task/xlsx/2021_Organisation_DataSeries.xlsx rename to compendium_v2/resources/2021_Organisation_DataSeries.xlsx diff --git a/compendium_v2/background_task/xlsx/2022_Connected_Users_DataSeries.xlsx b/compendium_v2/resources/2022_Connected_Users_DataSeries.xlsx similarity index 100% rename from compendium_v2/background_task/xlsx/2022_Connected_Users_DataSeries.xlsx rename to compendium_v2/resources/2022_Connected_Users_DataSeries.xlsx diff --git a/compendium_v2/background_task/xlsx/2022_Networks_DataSeries.xlsx b/compendium_v2/resources/2022_Networks_DataSeries.xlsx similarity index 100% rename from compendium_v2/background_task/xlsx/2022_Networks_DataSeries.xlsx rename to compendium_v2/resources/2022_Networks_DataSeries.xlsx diff --git a/compendium_v2/conversion/NREN-Services-prefills_2023_Recovered.xlsx b/compendium_v2/resources/NREN-Services-prefills_2023_Recovered.xlsx similarity index 100% rename from 
compendium_v2/conversion/NREN-Services-prefills_2023_Recovered.xlsx rename to compendium_v2/resources/NREN-Services-prefills_2023_Recovered.xlsx diff --git a/test/test_excel_publisher.py b/test/test_excel_publisher.py new file mode 100644 index 00000000..de0bc41b --- /dev/null +++ b/test/test_excel_publisher.py @@ -0,0 +1,94 @@ +import os + +from sqlalchemy import select, func + +from compendium_v2 import db +from compendium_v2.db import model +from compendium_v2.publishers.excel_publisher import _cli + +EXCEL_FILE = os.path.join(os.path.dirname(__file__), "data", "2021_Organisation_DataSeries.xlsx") + + +def test_publisher(app_with_survey_db, mocker, dummy_config): + mocker.patch('compendium_v2.publishers.excel_parser.EXCEL_ORGANISATION_2021', EXCEL_FILE) + + with app_with_survey_db.app_context(): + nren_names = ['SURF', 'KIFU', 'University of Malta', 'ASNET-AM', 'SIKT', 'LAT', 'RASH', 'ANAS', 'GRNET', 'CSC'] + db.session.add_all([model.NREN(name=nren_name, country='country') for nren_name in nren_names]) + db.session.commit() + + _cli(dummy_config, app_with_survey_db) + + with app_with_survey_db.app_context(): + budget_count = db.session.scalar(select(func.count(model.BudgetEntry.year))) + assert budget_count + funding_source_count = db.session.scalar(select(func.count(model.FundingSource.year))) + assert funding_source_count + charging_structure_count = db.session.scalar(select(func.count(model.ChargingStructure.year))) + assert charging_structure_count + staff_data = db.session.scalars(select(model.NrenStaff).order_by(model.NrenStaff.year.asc())).all() + + # data should only be saved for the NRENs we have saved in the database + staff_data_nrens = set([staff.nren.name for staff in staff_data]) + assert len(staff_data_nrens) == len(nren_names) - 1 # no UoM data + + kifu_data = [staff for staff in staff_data if staff.nren.name == 'KIFU'] + # check that the data is saved correctly for KIFU, it should be OK for the rest then.. 
+ assert len(kifu_data) == 6 + + assert kifu_data[0].year == 2016 + assert kifu_data[0].permanent_fte == 100 + assert kifu_data[0].subcontracted_fte == 2 + assert kifu_data[0].technical_fte == 0 + assert kifu_data[0].non_technical_fte == 0 + + assert kifu_data[1].year == 2017 + assert kifu_data[1].permanent_fte == 80 + assert kifu_data[1].subcontracted_fte == 2 + assert kifu_data[1].technical_fte == 0 + assert kifu_data[1].non_technical_fte == 0 + + assert kifu_data[2].year == 2018 + assert kifu_data[2].permanent_fte == 80 + assert kifu_data[2].subcontracted_fte == 3 + assert kifu_data[2].technical_fte == 0 + assert kifu_data[2].non_technical_fte == 0 + + assert kifu_data[3].year == 2019 + assert kifu_data[3].permanent_fte == 148 + assert kifu_data[3].subcontracted_fte == 4 + assert kifu_data[3].technical_fte == 117 + assert kifu_data[3].non_technical_fte == 33 + + assert kifu_data[4].year == 2020 + assert kifu_data[4].permanent_fte == 190 + assert kifu_data[4].subcontracted_fte == 3 + assert kifu_data[4].technical_fte == 133 + assert kifu_data[4].non_technical_fte == 60 + + assert kifu_data[5].year == 2021 + assert kifu_data[5].permanent_fte == 178 + assert kifu_data[5].subcontracted_fte == 3 + assert kifu_data[5].technical_fte == 133 + assert kifu_data[5].non_technical_fte == 45 + + ecproject_data = db.session.scalars(select(model.ECProject)).all() + # test a couple of random entries + surf2017 = [x for x in ecproject_data if x.nren.name == 'SURF' and x.year == 2017] + assert len(surf2017) == 1 + assert surf2017[0].project == 'Asterics and Magic' + + asnetam2018 = [x for x in ecproject_data if x.nren.name == 'ASNET-AM' and x.year == 2018] + assert len(asnetam2018) == 1 + assert asnetam2018[0].project == 'EaPConnect' + + kifu2019 = [x for x in ecproject_data if x.nren.name == 'KIFU' and x.year == 2019] + assert len(kifu2019) == 4 + assert kifu2019[3].project == 'SuperHeroes for Science' + + parent_data = db.session.scalars(select(model.ParentOrganization)).all() + 
# test a random entry + asnet2021 = [x for x in parent_data if x.nren.name == 'ASNET-AM' and x.year == 2021] + assert len(asnet2021) == 1 + assert asnet2021[0].organization\ + == 'Institute for Informatics and Automation Problems of the National Academy of Sciences of Armenia' -- GitLab