From fef8db90b841646110ef035b63c82e2b30d8320d Mon Sep 17 00:00:00 2001
From: Mohammad Torkashvand <mohammad.torkashvand@geant.org>
Date: Thu, 14 Sep 2023 13:10:48 +0200
Subject: [PATCH] refactor publisher and make excel files consistent

---
 compendium_v2/background_task/__init__.py     |   0
 compendium_v2/conversion/conversion.py        |   6 +-
 ...rvey_publisher_2022.py => db_publisher.py} |   0
 .../excel_parser.py}                          |  37 +++----
 ...vey_publisher_v1.py => excel_publisher.py} |  21 ++--
 .../2021_Organisation_DataSeries.xlsx         | Bin
 .../2022_Connected_Users_DataSeries.xlsx      | Bin
 .../2022_Networks_DataSeries.xlsx             | Bin
 ...NREN-Services-prefills_2023_Recovered.xlsx | Bin
 test/test_excel_publisher.py                  |  94 ++++++++++++++++++
 10 files changed, 128 insertions(+), 30 deletions(-)
 delete mode 100644 compendium_v2/background_task/__init__.py
 rename compendium_v2/publishers/{survey_publisher_2022.py => db_publisher.py} (100%)
 rename compendium_v2/{background_task/parse_excel_data.py => publishers/excel_parser.py} (91%)
 rename compendium_v2/publishers/{survey_publisher_v1.py => excel_publisher.py} (92%)
 rename compendium_v2/{background_task/xlsx => resources}/2021_Organisation_DataSeries.xlsx (100%)
 rename compendium_v2/{background_task/xlsx => resources}/2022_Connected_Users_DataSeries.xlsx (100%)
 rename compendium_v2/{background_task/xlsx => resources}/2022_Networks_DataSeries.xlsx (100%)
 rename compendium_v2/{conversion => resources}/NREN-Services-prefills_2023_Recovered.xlsx (100%)
 create mode 100644 test/test_excel_publisher.py

diff --git a/compendium_v2/background_task/__init__.py b/compendium_v2/background_task/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/compendium_v2/conversion/conversion.py b/compendium_v2/conversion/conversion.py
index a963b446..615165b0 100644
--- a/compendium_v2/conversion/conversion.py
+++ b/compendium_v2/conversion/conversion.py
@@ -30,7 +30,9 @@ setup_logging()
 
 logger = logging.getLogger('conversion')
 
-EXCEL_FILE = os.path.join(os.path.dirname(__file__), "NREN-Services-prefills_2023_Recovered.xlsx")
+resources_dir = os.path.join(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)), "resources")
+
+EXCEL_NREN_SERVICES_2023 = os.path.join(resources_dir, "NREN-Services-prefills_2023_Recovered.xlsx")
 
 
 def query_nren(nren_id: int):
@@ -112,7 +114,7 @@ def convert_answers(answers):
 
 
 def load_service_data():
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_NREN_SERVICES_2023, data_only=True, read_only=True)
     ws = wb["Sheet1"]
     rows = list(ws.rows)
 
diff --git a/compendium_v2/publishers/survey_publisher_2022.py b/compendium_v2/publishers/db_publisher.py
similarity index 100%
rename from compendium_v2/publishers/survey_publisher_2022.py
rename to compendium_v2/publishers/db_publisher.py
diff --git a/compendium_v2/background_task/parse_excel_data.py b/compendium_v2/publishers/excel_parser.py
similarity index 91%
rename from compendium_v2/background_task/parse_excel_data.py
rename to compendium_v2/publishers/excel_parser.py
index aba3ef6f..c1139266 100644
--- a/compendium_v2/background_task/parse_excel_data.py
+++ b/compendium_v2/publishers/excel_parser.py
@@ -9,14 +9,17 @@ setup_logging()
 
 logger = logging.getLogger(__name__)
 
-EXCEL_FILE = os.path.join(os.path.dirname(__file__), "xlsx", "2021_Organisation_DataSeries.xlsx")
-NETWORK_EXCEL_FILE = os.path.join(os.path.dirname(__file__), "xlsx", "2022_Networks_DataSeries.xlsx")
+resources_dir = os.path.join(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)), "resources")
 
+EXCEL_ORGANISATION_2021 = os.path.join(resources_dir, "2021_Organisation_DataSeries.xlsx")
+EXCEL_CONNECTED_USERS_2022 = os.path.join(resources_dir, "2022_Connected_Users_DataSeries.xlsx")
+EXCEL_NETWORKS_2022 = os.path.join(resources_dir, "2022_Networks_DataSeries.xlsx")  # file now lives in resources/
 
-def fetch_budget_excel_data():
+
+def fetch_budget_data():
     # load the xlsx file
     sheet_name = "1. Budget"
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
 
     # select the active worksheet
     ws = wb[sheet_name]
@@ -34,9 +37,9 @@ def fetch_budget_excel_data():
                 yield nren.upper(), budget, year
 
 
-def fetch_funding_excel_data():
+def fetch_funding_data():
     # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
 
     # select the active worksheet
     sheet_name = "2. Income Sources"
@@ -117,9 +120,9 @@ def fetch_funding_excel_data():
     yield from create_points_for_year_from_2018(ws2, 8, 51, 2021, 11, 12)
 
 
-def fetch_charging_structure_excel_data():
+def fetch_charging_structure_data():
     # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
 
     # select the active worksheet
     sheet_name = "3. Charging mechanism"
@@ -184,9 +187,9 @@ def fetch_charging_structure_excel_data():
     yield from create_points_for_2019(3, 46, 2019, 6)
 
 
-def fetch_staffing_excel_data():
+def fetch_staffing_data():
     # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
 
     # select the active worksheet
     sheet_name = "4. Staff"
@@ -232,9 +235,9 @@ def fetch_staffing_excel_data():
     yield from create_points_for_year(2021, 2, 5)
 
 
-def fetch_staff_function_excel_data():
+def fetch_staff_function_data():
     # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
 
     # select the active worksheet
     sheet_name = "5. Staff by Function"
@@ -300,9 +303,9 @@ def fetch_staff_function_excel_data():
     yield from create_points_for_year(2021, 3, 5)
 
 
-def fetch_ecproject_excel_data():
+def fetch_ecproject_data():
     # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
 
     # select the active worksheet
     sheet_name = "7. EC Projects"
@@ -332,9 +335,9 @@ def fetch_ecproject_excel_data():
     yield from create_points_for_year(2021, 1, 173)
 
 
-def fetch_organization_excel_data():
+def fetch_organization_data():
     # load the xlsx file
-    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_ORGANISATION_2021, data_only=True, read_only=True)
 
     # select the active worksheet
     sheet_name = "Organization"
@@ -352,7 +355,7 @@ def fetch_organization_excel_data():
 
 def fetch_traffic_excel_data():
     # load the xlsx file
-    wb = openpyxl.load_workbook(NETWORK_EXCEL_FILE, data_only=True, read_only=True)
+    wb = openpyxl.load_workbook(EXCEL_NETWORKS_2022, data_only=True, read_only=True)
 
     # select the active worksheet
     sheet_name = "Estimated_Traffic TByte"
diff --git a/compendium_v2/publishers/survey_publisher_v1.py b/compendium_v2/publishers/excel_publisher.py
similarity index 92%
rename from compendium_v2/publishers/survey_publisher_v1.py
rename to compendium_v2/publishers/excel_publisher.py
index a6e2376f..b7843c28 100644
--- a/compendium_v2/publishers/survey_publisher_v1.py
+++ b/compendium_v2/publishers/excel_publisher.py
@@ -1,5 +1,5 @@
 """
-survey_publisher_v1
+excel_publisher
 =========================
 
 This module loads the survey data from before 2022 from an excel file.
@@ -16,15 +16,14 @@ from sqlalchemy import select
 
 import compendium_v2
 from compendium_v2.environment import setup_logging
-from compendium_v2.background_task import parse_excel_data
 from compendium_v2.config import load
 from compendium_v2.db import db, model
 from compendium_v2.survey_db import model as survey_model
-from compendium_v2.publishers import helpers
+from compendium_v2.publishers import helpers, excel_parser
 
 setup_logging()
 
-logger = logging.getLogger('survey-publisher-v1')
+logger = logging.getLogger(__name__)
 
 
 def db_budget_migration(nren_dict):
@@ -55,7 +54,7 @@ def db_budget_migration(nren_dict):
             db.session.merge(budget_entry)
 
     # Import the data from excel sheet to database
-    exceldata = parse_excel_data.fetch_budget_excel_data()
+    exceldata = excel_parser.fetch_budget_data()
 
     for abbrev, budget, year in exceldata:
         if abbrev not in nren_dict:
@@ -77,7 +76,7 @@ def db_budget_migration(nren_dict):
 
 def db_funding_migration(nren_dict):
     # Import the data to database
-    data = parse_excel_data.fetch_funding_excel_data()
+    data = excel_parser.fetch_funding_data()
 
     for (abbrev, year, client_institution,
             european_funding,
@@ -108,7 +107,7 @@ def db_funding_migration(nren_dict):
 
 def db_charging_structure_migration(nren_dict):
     # Import the data to database
-    data = parse_excel_data.fetch_charging_structure_excel_data()
+    data = excel_parser.fetch_charging_structure_data()
 
     for (abbrev, year, charging_structure) in data:
         if abbrev not in nren_dict:
@@ -126,7 +125,7 @@ def db_charging_structure_migration(nren_dict):
 
 
 def db_staffing_migration(nren_dict):
-    staff_data = parse_excel_data.fetch_staffing_excel_data()
+    staff_data = excel_parser.fetch_staffing_data()
 
     nren_staff_map = {}
     for (abbrev, year, permanent_fte, subcontracted_fte) in staff_data:
@@ -145,7 +144,7 @@ def db_staffing_migration(nren_dict):
             non_technical_fte=0
         )
 
-    function_data = parse_excel_data.fetch_staff_function_excel_data()
+    function_data = excel_parser.fetch_staff_function_data()
     for (abbrev, year, technical_fte, non_technical_fte) in function_data:
         if abbrev not in nren_dict:
             logger.warning(f'{abbrev} unknown. Skipping staff function data.')
@@ -179,7 +178,7 @@ def db_staffing_migration(nren_dict):
 
 
 def db_ecprojects_migration(nren_dict):
-    ecproject_data = parse_excel_data.fetch_ecproject_excel_data()
+    ecproject_data = excel_parser.fetch_ecproject_data()
     for (abbrev, year, project) in ecproject_data:
         if abbrev not in nren_dict:
             logger.warning(f'{abbrev} unknown. Skipping.')
@@ -192,7 +191,7 @@ def db_ecprojects_migration(nren_dict):
 
 
 def db_organizations_migration(nren_dict):
-    organization_data = parse_excel_data.fetch_organization_excel_data()
+    organization_data = excel_parser.fetch_organization_data()
     for (abbrev, year, org) in organization_data:
         if abbrev not in nren_dict:
             logger.warning(f'{abbrev} unknown. Skipping.')
diff --git a/compendium_v2/background_task/xlsx/2021_Organisation_DataSeries.xlsx b/compendium_v2/resources/2021_Organisation_DataSeries.xlsx
similarity index 100%
rename from compendium_v2/background_task/xlsx/2021_Organisation_DataSeries.xlsx
rename to compendium_v2/resources/2021_Organisation_DataSeries.xlsx
diff --git a/compendium_v2/background_task/xlsx/2022_Connected_Users_DataSeries.xlsx b/compendium_v2/resources/2022_Connected_Users_DataSeries.xlsx
similarity index 100%
rename from compendium_v2/background_task/xlsx/2022_Connected_Users_DataSeries.xlsx
rename to compendium_v2/resources/2022_Connected_Users_DataSeries.xlsx
diff --git a/compendium_v2/background_task/xlsx/2022_Networks_DataSeries.xlsx b/compendium_v2/resources/2022_Networks_DataSeries.xlsx
similarity index 100%
rename from compendium_v2/background_task/xlsx/2022_Networks_DataSeries.xlsx
rename to compendium_v2/resources/2022_Networks_DataSeries.xlsx
diff --git a/compendium_v2/conversion/NREN-Services-prefills_2023_Recovered.xlsx b/compendium_v2/resources/NREN-Services-prefills_2023_Recovered.xlsx
similarity index 100%
rename from compendium_v2/conversion/NREN-Services-prefills_2023_Recovered.xlsx
rename to compendium_v2/resources/NREN-Services-prefills_2023_Recovered.xlsx
diff --git a/test/test_excel_publisher.py b/test/test_excel_publisher.py
new file mode 100644
index 00000000..de0bc41b
--- /dev/null
+++ b/test/test_excel_publisher.py
@@ -0,0 +1,94 @@
+import os
+
+from sqlalchemy import select, func
+
+from compendium_v2 import db
+from compendium_v2.db import model
+from compendium_v2.publishers.excel_publisher import _cli
+
+EXCEL_FILE = os.path.join(os.path.dirname(__file__), "data", "2021_Organisation_DataSeries.xlsx")
+
+
+def test_publisher(app_with_survey_db, mocker, dummy_config):
+    """Run the excel publisher against the test workbook and verify the rows it stores."""
+    mocker.patch('compendium_v2.publishers.excel_parser.EXCEL_ORGANISATION_2021', EXCEL_FILE)
+    with app_with_survey_db.app_context():
+        nren_names = ['SURF', 'KIFU', 'University of Malta', 'ASNET-AM', 'SIKT', 'LAT', 'RASH', 'ANAS', 'GRNET', 'CSC']
+        db.session.add_all([model.NREN(name=nren_name, country='country') for nren_name in nren_names])
+        db.session.commit()
+
+    _cli(dummy_config, app_with_survey_db)
+
+    with app_with_survey_db.app_context():
+        budget_count = db.session.scalar(select(func.count(model.BudgetEntry.year)))
+        assert budget_count
+        funding_source_count = db.session.scalar(select(func.count(model.FundingSource.year)))
+        assert funding_source_count
+        charging_structure_count = db.session.scalar(select(func.count(model.ChargingStructure.year)))
+        assert charging_structure_count
+        staff_data = db.session.scalars(select(model.NrenStaff).order_by(model.NrenStaff.year.asc())).all()
+
+        # staff rows must only exist for NRENs that were inserted into the database above
+        staff_data_nrens = {staff.nren.name for staff in staff_data}
+        assert len(staff_data_nrens) == len(nren_names) - 1  # no UoM (University of Malta) data
+
+        kifu_data = [staff for staff in staff_data if staff.nren.name == 'KIFU']
+        # verify KIFU year by year (staff_data is ordered by year) as a representative sample
+        assert len(kifu_data) == 6
+
+        assert kifu_data[0].year == 2016
+        assert kifu_data[0].permanent_fte == 100
+        assert kifu_data[0].subcontracted_fte == 2
+        assert kifu_data[0].technical_fte == 0
+        assert kifu_data[0].non_technical_fte == 0
+
+        assert kifu_data[1].year == 2017
+        assert kifu_data[1].permanent_fte == 80
+        assert kifu_data[1].subcontracted_fte == 2
+        assert kifu_data[1].technical_fte == 0
+        assert kifu_data[1].non_technical_fte == 0
+
+        assert kifu_data[2].year == 2018
+        assert kifu_data[2].permanent_fte == 80
+        assert kifu_data[2].subcontracted_fte == 3
+        assert kifu_data[2].technical_fte == 0
+        assert kifu_data[2].non_technical_fte == 0
+
+        assert kifu_data[3].year == 2019
+        assert kifu_data[3].permanent_fte == 148
+        assert kifu_data[3].subcontracted_fte == 4
+        assert kifu_data[3].technical_fte == 117
+        assert kifu_data[3].non_technical_fte == 33
+
+        assert kifu_data[4].year == 2020
+        assert kifu_data[4].permanent_fte == 190
+        assert kifu_data[4].subcontracted_fte == 3
+        assert kifu_data[4].technical_fte == 133
+        assert kifu_data[4].non_technical_fte == 60
+
+        assert kifu_data[5].year == 2021
+        assert kifu_data[5].permanent_fte == 178
+        assert kifu_data[5].subcontracted_fte == 3
+        assert kifu_data[5].technical_fte == 133
+        assert kifu_data[5].non_technical_fte == 45
+
+        ecproject_data = db.session.scalars(select(model.ECProject)).all()
+        # spot-check a few hand-picked entries
+        surf2017 = [x for x in ecproject_data if x.nren.name == 'SURF' and x.year == 2017]
+        assert len(surf2017) == 1
+        assert surf2017[0].project == 'Asterics and Magic'
+
+        asnetam2018 = [x for x in ecproject_data if x.nren.name == 'ASNET-AM' and x.year == 2018]
+        assert len(asnetam2018) == 1
+        assert asnetam2018[0].project == 'EaPConnect'
+
+        kifu2019 = [x for x in ecproject_data if x.nren.name == 'KIFU' and x.year == 2019]
+        assert len(kifu2019) == 4
+        assert 'SuperHeroes for Science' in {x.project for x in kifu2019}  # no ORDER BY: position is arbitrary
+
+        parent_data = db.session.scalars(select(model.ParentOrganization)).all()
+        # spot-check one hand-picked entry
+        asnet2021 = [x for x in parent_data if x.nren.name == 'ASNET-AM' and x.year == 2021]
+        assert len(asnet2021) == 1
+        assert asnet2021[0].organization\
+            == 'Institute for Informatics and Automation Problems of the National Academy of Sciences of Armenia'
-- 
GitLab