refactor publisher and make excel files consistent

fef8db90 · Mohammad Torkashvand · d6f91bd5 · d6f91bd5 · fef8db90 · fef8db90
Commit fef8db90 authored 1 year ago by Mohammad Torkashvand
--- a/compendium_v2/background_task/__init__.py
+++ b/compendium_v2/background_task/__init__.py
--- a/compendium_v2/conversion/conversion.py
+++ b/compendium_v2/conversion/conversion.py
@@ -30,7 +30,9 @@ setup_logging()
 logger = logging.getLogger('conversion')
-EXCEL_FILE = os.path.join(os.path.dirname(__file__), "NREN-Services-prefills_2023_Recovered.xlsx")
+resources_dir = f"{os.path.abspath(os.path.join( os.path.dirname(__file__), os.pardir))}/resources"
+EXCEL_NREN_SERVICES_2023 = os.path.join(resources_dir, "NREN-Services-prefills_2023_Recovered.xlsx")
 def query_nren(nren_id: int):
@@ -112,7 +114,7 @@ def convert_answers(answers):
 def load_service_data():
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_NREN_SERVICES_2023, data_only=True, read_only=True)
    ws = wb["Sheet1"]
    rows = list(ws.rows)

--- a/compendium_v2/publishers/survey_publisher_2022.py
+++ b/compendium_v2/publishers/survey_publisher_2022.py
--- a/compendium_v2/background_task/parse_excel_data.py
+++ b/compendium_v2/background_task/parse_excel_data.py
@@ -9,14 +9,17 @@ setup_logging()
 logger = logging.getLogger(__name__)
-EXCEL_FILE = os.path.join(os.path.dirname(__file__), "xlsx", "2021_Organisation_DataSeries.xlsx")
+resources_dir = f"{os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))}/resources"
-NETWORK_EXCEL_FILE = os.path.join(os.path.dirname(__file__), "xlsx", "2022_Networks_DataSeries.xlsx")
+# Every workbook path is built from resources_dir, i.e. the "resources" folder one level above this module's directory.
+EXCEL_ORGANISATION_2021 = os.path.join(resources_dir, "2021_Organisation_DataSeries.xlsx")
+EXCEL_CONNECTED_USERS_2022 = os.path.join(resources_dir, "2022_Connected_Users_DataSeries.xlsx")
+EXCEL_NETWORKS_2022 = os.path.join(resources_dir, "2022_Networks_DataSeries.xlsx")
-def fetch_budget_excel_data():
+def fetch_budget_data():
    # load the xlsx file
    sheet_name = "1. Budget"
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
    # select the active worksheet
    ws = wb[sheet_name]
@@ -34,9 +37,9 @@ def fetch_budget_excel_data():
                yield nren.upper(), budget, year
-def fetch_funding_excel_data():
+def fetch_funding_data():
    # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
    # select the active worksheet
    sheet_name = "2. Income Sources"
@@ -117,9 +120,9 @@ def fetch_funding_excel_data():
    yield from create_points_for_year_from_2018(ws2, 8, 51, 2021, 11, 12)
-def fetch_charging_structure_excel_data():
+def fetch_charging_structure_data():
    # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
    # select the active worksheet
    sheet_name = "3. Charging mechanism"
@@ -184,9 +187,9 @@ def fetch_charging_structure_excel_data():
    yield from create_points_for_2019(3, 46, 2019, 6)
-def fetch_staffing_excel_data():
+def fetch_staffing_data():
    # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
    # select the active worksheet
    sheet_name = "4. Staff"
@@ -232,9 +235,9 @@ def fetch_staffing_excel_data():
    yield from create_points_for_year(2021, 2, 5)
-def fetch_staff_function_excel_data():
+def fetch_staff_function_data():
    # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
    # select the active worksheet
    sheet_name = "5. Staff by Function"
@@ -300,9 +303,9 @@ def fetch_staff_function_excel_data():
    yield from create_points_for_year(2021, 3, 5)
-def fetch_ecproject_excel_data():
+def fetch_ecproject_data():
    # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
    # select the active worksheet
    sheet_name = "7. EC Projects"
@@ -332,9 +335,9 @@ def fetch_ecproject_excel_data():
    yield from create_points_for_year(2021, 1, 173)
-def fetch_organization_excel_data():
+def fetch_organization_data():
    # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
    # select the active worksheet
    sheet_name = "Organization"
@@ -352,7 +355,7 @@ def fetch_organization_excel_data():
 def fetch_traffic_excel_data():
    # load the xlsx file
-    wb = openpyxl.load_workbook(NETWORK_EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_NETWORKS_2022, data_only=True, read_only=True)
    # select the active worksheet
    sheet_name = "Estimated_Traffic TByte"

--- a/compendium_v2/publishers/survey_publisher_v1.py
+++ b/compendium_v2/publishers/survey_publisher_v1.py
 """
-survey_publisher_v1
+excel_publisher
 =========================
 This module loads the survey data from before 2022 from an excel file.
@@ -16,15 +16,14 @@ from sqlalchemy import select
 import compendium_v2
 from compendium_v2.environment import setup_logging
-from compendium_v2.background_task import parse_excel_data
 from compendium_v2.config import load
 from compendium_v2.db import db, model
 from compendium_v2.survey_db import model as survey_model
-from compendium_v2.publishers import helpers
+from compendium_v2.publishers import helpers, excel_parser
 setup_logging()
-logger = logging.getLogger('survey-publisher-v1')
+logger = logging.getLogger(__name__)
 def db_budget_migration(nren_dict):
@@ -55,7 +54,7 @@ def db_budget_migration(nren_dict):
            db.session.merge(budget_entry)
    # Import the data from excel sheet to database
-    exceldata = parse_excel_data.fetch_budget_excel_data()
+    exceldata = excel_parser.fetch_budget_data()
    for abbrev, budget, year in exceldata:
        if abbrev not in nren_dict:
@@ -77,7 +76,7 @@ def db_budget_migration(nren_dict):
 def db_funding_migration(nren_dict):
    # Import the data to database
-    data = parse_excel_data.fetch_funding_excel_data()
+    data = excel_parser.fetch_funding_data()
    for (abbrev, year, client_institution,
            european_funding,
@@ -108,7 +107,7 @@ def db_funding_migration(nren_dict):
 def db_charging_structure_migration(nren_dict):
    # Import the data to database
-    data = parse_excel_data.fetch_charging_structure_excel_data()
+    data = excel_parser.fetch_charging_structure_data()
    for (abbrev, year, charging_structure) in data:
        if abbrev not in nren_dict:
@@ -126,7 +125,7 @@ def db_charging_structure_migration(nren_dict):
 def db_staffing_migration(nren_dict):
-    staff_data = parse_excel_data.fetch_staffing_excel_data()
+    staff_data = excel_parser.fetch_staffing_data()
    nren_staff_map = {}
    for (abbrev, year, permanent_fte, subcontracted_fte) in staff_data:
@@ -145,7 +144,7 @@ def db_staffing_migration(nren_dict):
            non_technical_fte=0
        )
-    function_data = parse_excel_data.fetch_staff_function_excel_data()
+    function_data = excel_parser.fetch_staff_function_data()
    for (abbrev, year, technical_fte, non_technical_fte) in function_data:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping staff function data.')
@@ -179,7 +178,7 @@ def db_staffing_migration(nren_dict):
 def db_ecprojects_migration(nren_dict):
-    ecproject_data = parse_excel_data.fetch_ecproject_excel_data()
+    ecproject_data = excel_parser.fetch_ecproject_data()
    for (abbrev, year, project) in ecproject_data:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
@@ -192,7 +191,7 @@ def db_ecprojects_migration(nren_dict):
 def db_organizations_migration(nren_dict):
-    organization_data = parse_excel_data.fetch_organization_excel_data()
+    organization_data = excel_parser.fetch_organization_data()
    for (abbrev, year, org) in organization_data:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')

--- a/compendium_v2/background_task/xlsx/2021_Organisation_DataSeries.xlsx
+++ b/compendium_v2/background_task/xlsx/2021_Organisation_DataSeries.xlsx
--- a/compendium_v2/background_task/xlsx/2022_Connected_Users_DataSeries.xlsx
+++ b/compendium_v2/background_task/xlsx/2022_Connected_Users_DataSeries.xlsx
--- a/compendium_v2/background_task/xlsx/2022_Networks_DataSeries.xlsx
+++ b/compendium_v2/background_task/xlsx/2022_Networks_DataSeries.xlsx
--- a/compendium_v2/conversion/NREN-Services-prefills_2023_Recovered.xlsx
+++ b/compendium_v2/conversion/NREN-Services-prefills_2023_Recovered.xlsx
--- a/test/test_excel_publisher.py
+++ b/test/test_excel_publisher.py
+import os
+from sqlalchemy import select, func
+from compendium_v2 import db
+from compendium_v2.db import model
+from compendium_v2.publishers.excel_publisher import _cli
+EXCEL_FILE = os.path.join(os.path.dirname(__file__), "data", "2021_Organisation_DataSeries.xlsx")
+def test_publisher(app_with_survey_db, mocker, dummy_config):
+    """Run the excel publisher against the fixture workbook and spot-check the stored rows.
+
+    ``excel_parser.EXCEL_ORGANISATION_2021`` is patched to point at the workbook in ``test/data``,
+    ten NRENs are seeded, ``_cli`` is invoked, and the budget, funding, charging structure, staff,
+    EC project and parent organization tables are then inspected.
+    """
+    mocker.patch('compendium_v2.publishers.excel_parser.EXCEL_ORGANISATION_2021', EXCEL_FILE)
+    nren_names = ['SURF', 'KIFU', 'University of Malta', 'ASNET-AM', 'SIKT', 'LAT', 'RASH', 'ANAS', 'GRNET', 'CSC']
+    with app_with_survey_db.app_context():
+        db.session.add_all([model.NREN(name=nren_name, country='country') for nren_name in nren_names])
+        db.session.commit()
+    _cli(dummy_config, app_with_survey_db)
+    with app_with_survey_db.app_context():
+        # each of these tables must have received at least one row
+        for counted_model in (model.BudgetEntry, model.FundingSource, model.ChargingStructure):
+            assert db.session.scalar(select(func.count(counted_model.year)))
+        staff_data = db.session.scalars(select(model.NrenStaff).order_by(model.NrenStaff.year.asc())).all()
+        # data should only be saved for the NRENs we have saved in the database
+        staff_data_nrens = {staff.nren.name for staff in staff_data}
+        assert len(staff_data_nrens) == len(nren_names) - 1  # no UoM data
+        kifu_data = [staff for staff in staff_data if staff.nren.name == 'KIFU']
+        # check that the data is saved correctly for KIFU, it should be OK for the rest then..
+        # columns: (year, permanent_fte, subcontracted_fte, technical_fte, non_technical_fte)
+        expected_kifu = [
+            (2016, 100, 2, 0, 0),
+            (2017, 80, 2, 0, 0),
+            (2018, 80, 3, 0, 0),
+            (2019, 148, 4, 117, 33),
+            (2020, 190, 3, 133, 60),
+            (2021, 178, 3, 133, 45),
+        ]
+        assert len(kifu_data) == len(expected_kifu)
+        # staff_data is ordered by year ascending, so the rows line up with the table above
+        for staff, (year, permanent, subcontracted, technical, non_technical) in zip(kifu_data, expected_kifu):
+            assert staff.year == year
+            assert staff.permanent_fte == permanent
+            assert staff.subcontracted_fte == subcontracted
+            assert staff.technical_fte == technical
+            assert staff.non_technical_fte == non_technical
+        ecproject_data = db.session.scalars(select(model.ECProject)).all()
+        # test a couple of random entries
+        surf2017 = [x for x in ecproject_data if x.nren.name == 'SURF' and x.year == 2017]
+        assert len(surf2017) == 1
+        assert surf2017[0].project == 'Asterics and Magic'
+        asnetam2018 = [x for x in ecproject_data if x.nren.name == 'ASNET-AM' and x.year == 2018]
+        assert len(asnetam2018) == 1
+        assert asnetam2018[0].project == 'EaPConnect'
+        kifu2019 = [x for x in ecproject_data if x.nren.name == 'KIFU' and x.year == 2019]
+        assert len(kifu2019) == 4
+        # the ECProject query has no ORDER BY, so check membership instead of a fixed row position
+        assert 'SuperHeroes for Science' in {x.project for x in kifu2019}
+        parent_data = db.session.scalars(select(model.ParentOrganization)).all()
+        # test a random entry
+        asnet2021 = [x for x in parent_data if x.nren.name == 'ASNET-AM' and x.year == 2021]
+        assert len(asnet2021) == 1
+        expected_parent = 'Institute for Informatics and Automation Problems of the National Academy of Sciences of Armenia'
+        assert asnet2021[0].organization == expected_parent