From eb0cb63e8adbf1c59fa6d6ee6dc58a6de5a35c2d Mon Sep 17 00:00:00 2001 From: Remco Tukker <remco.tukker@geant.org> Date: Tue, 2 May 2023 10:16:39 +0200 Subject: [PATCH 1/2] make sure all data is read from the excel and improve logging a little bit --- .../background_task/parse_excel_data.py | 20 +++++++------- .../publishers/survey_publisher_v1.py | 27 ++++++++++--------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/compendium_v2/background_task/parse_excel_data.py b/compendium_v2/background_task/parse_excel_data.py index cf060409..993ece98 100644 --- a/compendium_v2/background_task/parse_excel_data.py +++ b/compendium_v2/background_task/parse_excel_data.py @@ -20,8 +20,8 @@ def fetch_budget_excel_data(): # select the active worksheet ws = wb[sheet_name] # iterate over the rows in the worksheet - for row in range(14, 57): - for col in range(3, 8): + for row in range(14, 58): + for col in range(3, 9): # extract the data from the row nren = ws.cell(row=row, column=2).value budget = ws.cell(row=row, column=col).value @@ -29,8 +29,6 @@ def fetch_budget_excel_data(): if budget is not None: budget = round(budget / 1000000, 2) - if budget > 200: - logger.info(f'{nren} has budget set to >200M EUR for {year}. ({budget})') yield nren.upper(), budget, year @@ -99,19 +97,19 @@ def fetch_funding_excel_data(): yield (nren.upper(), year, client_institution, european_funding, gov_public_bodies, commercial, other) # For 2016 - yield from create_points_for_year_until_2017(8, 50, 2016, 43, 45) + yield from create_points_for_year_until_2017(8, 51, 2016, 43, 45) # For 2017 - yield from create_points_for_year_until_2017(8, 50, 2017, 32, 35) + yield from create_points_for_year_until_2017(8, 51, 2017, 32, 35) # For 2018 - yield from create_points_for_year_from_2018(8, 50, 2018, 21) + yield from create_points_for_year_from_2018(8, 51, 2018, 21) # For 2019 - yield from create_points_for_year_from_2018(8, 50, 2019, 12) + yield from create_points_for_year_from_2018(8, 51, 2019, 12) # For 2020 - yield from create_points_for_year_from_2018(8, 50, 2020, 3) + yield from create_points_for_year_from_2018(8, 51, 2020, 3) def fetch_charging_structure_excel_data(): @@ -148,10 +146,10 @@ def fetch_charging_structure_excel_data(): yield nren.upper(), year, charging_structure # For 2021 - yield from create_points_for_year(3, 45, 2021, 2) + yield from create_points_for_year(3, 46, 2021, 2) # For 2019 - yield from create_points_for_year(3, 45, 2019, 6) + yield from create_points_for_year(3, 46, 2019, 6) def fetch_staffing_excel_data(): diff --git a/compendium_v2/publishers/survey_publisher_v1.py b/compendium_v2/publishers/survey_publisher_v1.py index a3fd3d98..ede96547 100644 --- a/compendium_v2/publishers/survey_publisher_v1.py +++ b/compendium_v2/publishers/survey_publisher_v1.py @@ -38,10 +38,10 @@ def db_budget_migration(): year = budget.year if float(budget.budget) > 200: - logger.info(f'Incorrect Data: {abbrev} has budget set to >200M EUR for {year}. ({budget.budget})') + logger.warn(f'Incorrect Data: {abbrev} has budget set to >200M EUR for {year}. ({budget.budget})') if abbrev not in nren_dict: - logger.info(f'{abbrev} unknown. Skipping.') + logger.warn(f'{abbrev} unknown. Skipping.') continue budget_entry = model.BudgetEntry( @@ -56,9 +56,12 @@ def db_budget_migration(): for abbrev, budget, year in exceldata: if abbrev not in nren_dict: - logger.info(f'{abbrev} unknown. Skipping.') + logger.warn(f'{abbrev} unknown. Skipping.') continue + if budget > 200: + logger.warn(f'{nren} has budget set to >200M EUR for {year}. ({budget})') + budget_entry = model.BudgetEntry(nren=nren_dict[abbrev], budget=budget, year=year) session.merge(budget_entry) session.commit() @@ -78,11 +81,11 @@ def db_funding_migration(): _data = [client_institution, european_funding, gov_public_bodies, commercial, other] total = sum(_data) - if not math.isclose(total, 100, abs_tol=0.01): - logger.info(f'{abbrev} funding sources for {year} do not sum to 100% ({total})') + if not math.isclose(total, 100, abs_tol=0.01) and total != 0: + logger.warn(f'{abbrev} funding sources for {year} do not sum to 100% ({total})') if abbrev not in nren_dict: - logger.info(f'{abbrev} unknown. Skipping.') + logger.warn(f'{abbrev} unknown. Skipping.') continue budget_entry = model.FundingSource( @@ -106,7 +109,7 @@ def db_charging_structure_migration(): for (abbrev, year, charging_structure) in data: if abbrev not in nren_dict: - logger.info(f'{abbrev} unknown. Skipping.') + logger.warn(f'{abbrev} unknown. Skipping.') continue charging_structure_entry = model.ChargingStructure( @@ -124,7 +127,7 @@ def db_staffing_migration(): nren_staff_map = {} for (abbrev, year, permanent_fte, subcontracted_fte) in staff_data: if abbrev not in nren_dict: - logger.info(f'{abbrev} unknown. Skipping staff data.') + logger.warn(f'{abbrev} unknown. Skipping staff data.') continue nren = nren_dict[abbrev] @@ -141,7 +144,7 @@ def db_staffing_migration(): function_data = parse_excel_data.fetch_staff_function_excel_data() for (abbrev, year, technical_fte, non_technical_fte) in function_data: if abbrev not in nren_dict: - logger.info(f'{abbrev} unknown. Skipping staff function data.') + logger.warn(f'{abbrev} unknown. Skipping staff function data.') continue nren = nren_dict[abbrev] @@ -162,9 +165,9 @@ def db_staffing_migration(): for nren_staff_model in nren_staff_map.values(): employed = nren_staff_model.permanent_fte + nren_staff_model.subcontracted_fte technical = nren_staff_model.technical_fte + nren_staff_model.non_technical_fte - if not math.isclose(employed, technical, abs_tol=0.01): - logger.info(f'{nren_staff_model.nren.name} in {nren_staff_model.year}:' - f' FTE do not equal across employed/technical categories ({employed} != {technical})') + if not math.isclose(employed, technical, abs_tol=0.01) and employed != 0 and technical != 0: + logger.warn(f'{nren_staff_model.nren.name} in {nren_staff_model.year}:' + f' FTE do not equal across employed/technical categories ({employed} != {technical})') session.merge(nren_staff_model) -- GitLab From d3e16659e7d202a8917f49f36c3968374d2fe60f Mon Sep 17 00:00:00 2001 From: Remco Tukker <remco.tukker@geant.org> Date: Tue, 2 May 2023 13:40:05 +0200 Subject: [PATCH 2/2] added v1 publish for ecprojects and parent org data --- .../background_task/parse_excel_data.py | 50 +++++++++++++++++ .../publishers/survey_publisher_v1.py | 56 +++++++++++++++---- test/test_survey_publisher_v1.py | 21 +++++++ 3 files changed, 116 insertions(+), 11 deletions(-) diff --git a/compendium_v2/background_task/parse_excel_data.py b/compendium_v2/background_task/parse_excel_data.py index 993ece98..51ec2efe 100644 --- a/compendium_v2/background_task/parse_excel_data.py +++ b/compendium_v2/background_task/parse_excel_data.py @@ -266,3 +266,53 @@ def fetch_staff_function_excel_data(): # For 2021 yield from create_points_for_year(2021, 3, 5) + + +def fetch_ecproject_excel_data(): + # load the xlsx file + wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True) + + # select the active worksheet + sheet_name = "7. EC Projects" + ws = wb[sheet_name] + + start_row = 6 + + def create_points_for_year(year, start_column, end_row): + for row in range(start_row, end_row): + # extract the data from the row + nren = ws.cell(row=row, column=start_column).value + if nren is None: + continue + project = ws.cell(row=row, column=start_column + 1).value + if project is None: + continue + yield nren.upper(), year, project + + yield from create_points_for_year(2017, 13, 165) + + yield from create_points_for_year(2018, 10, 165) + + yield from create_points_for_year(2019, 7, 165) + + yield from create_points_for_year(2020, 4, 180) + + yield from create_points_for_year(2021, 1, 173) + + +def fetch_organization_excel_data(): + # load the xlsx file + wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True) + + # select the active worksheet + sheet_name = "Organization" + ws = wb[sheet_name] + + # iterate over the rows in the worksheet + for row in range(5, 48): + # extract the data from the row + nren = ws.cell(row=row, column=2).value + parent_org = ws.cell(row=row, column=4).value + + if parent_org not in [None, 'NA', 'N/A']: + yield nren.upper(), 2021, parent_org diff --git a/compendium_v2/publishers/survey_publisher_v1.py b/compendium_v2/publishers/survey_publisher_v1.py index ede96547..ba0e4e2a 100644 --- a/compendium_v2/publishers/survey_publisher_v1.py +++ b/compendium_v2/publishers/survey_publisher_v1.py @@ -2,7 +2,7 @@ survey_publisher_v1 ========================= -This module loads the survey data from before 2022 from and excel file. +This module loads the survey data from before 2022 from an excel file. Missing info is filled in from the survey db for some questions. Registered as click cli command when installing compendium-v2. @@ -38,10 +38,10 @@ def db_budget_migration(): year = budget.year if float(budget.budget) > 200: - logger.warn(f'Incorrect Data: {abbrev} has budget set to >200M EUR for {year}. ({budget.budget})') + logger.warning(f'Incorrect Data: {abbrev} has budget set >200M EUR for {year}. ({budget.budget})') if abbrev not in nren_dict: - logger.warn(f'{abbrev} unknown. Skipping.') + logger.warning(f'{abbrev} unknown. Skipping.') continue budget_entry = model.BudgetEntry( @@ -56,11 +56,11 @@ def db_budget_migration(): for abbrev, budget, year in exceldata: if abbrev not in nren_dict: - logger.warn(f'{abbrev} unknown. Skipping.') + logger.warning(f'{abbrev} unknown. Skipping.') continue if budget > 200: - logger.warn(f'{nren} has budget set to >200M EUR for {year}. ({budget})') + logger.warning(f'{nren} has budget set to >200M EUR for {year}. ({budget})') budget_entry = model.BudgetEntry(nren=nren_dict[abbrev], budget=budget, year=year) session.merge(budget_entry) @@ -82,10 +82,10 @@ def db_funding_migration(): _data = [client_institution, european_funding, gov_public_bodies, commercial, other] total = sum(_data) if not math.isclose(total, 100, abs_tol=0.01) and total != 0: - logger.warn(f'{abbrev} funding sources for {year} do not sum to 100% ({total})') + logger.warning(f'{abbrev} funding sources for {year} do not sum to 100% ({total})') if abbrev not in nren_dict: - logger.warn(f'{abbrev} unknown. Skipping.') + logger.warning(f'{abbrev} unknown. Skipping.') continue budget_entry = model.FundingSource( @@ -109,7 +109,7 @@ def db_charging_structure_migration(): for (abbrev, year, charging_structure) in data: if abbrev not in nren_dict: - logger.warn(f'{abbrev} unknown. Skipping.') + logger.warning(f'{abbrev} unknown. Skipping.') continue charging_structure_entry = model.ChargingStructure( @@ -127,7 +127,7 @@ def db_staffing_migration(): nren_staff_map = {} for (abbrev, year, permanent_fte, subcontracted_fte) in staff_data: if abbrev not in nren_dict: - logger.warn(f'{abbrev} unknown. Skipping staff data.') + logger.warning(f'{abbrev} unknown. Skipping staff data.') continue nren = nren_dict[abbrev] @@ -144,7 +144,7 @@ def db_staffing_migration(): function_data = parse_excel_data.fetch_staff_function_excel_data() for (abbrev, year, technical_fte, non_technical_fte) in function_data: if abbrev not in nren_dict: - logger.warn(f'{abbrev} unknown. Skipping staff function data.') + logger.warning(f'{abbrev} unknown. Skipping staff function data.') continue nren = nren_dict[abbrev] @@ -166,7 +166,7 @@ def db_staffing_migration(): employed = nren_staff_model.permanent_fte + nren_staff_model.subcontracted_fte technical = nren_staff_model.technical_fte + nren_staff_model.non_technical_fte if not math.isclose(employed, technical, abs_tol=0.01) and employed != 0 and technical != 0: - logger.warn(f'{nren_staff_model.nren.name} in {nren_staff_model.year}:' + logger.warning(f'{nren_staff_model.nren.name} in {nren_staff_model.year}:' f' FTE do not equal across employed/technical categories ({employed} != {technical})') session.merge(nren_staff_model) @@ -174,12 +174,46 @@ def db_staffing_migration(): session.commit() +def db_ecprojects_migration(): + with db.session_scope() as session: + nren_dict = helpers.get_uppercase_nren_dict(session) + + ecproject_data = parse_excel_data.fetch_ecproject_excel_data() + for (abbrev, year, project) in ecproject_data: + if abbrev not in nren_dict: + logger.warning(f'{abbrev} unknown. Skipping.') + continue + + nren = nren_dict[abbrev] + ecproject_entry = model.ECProject(nren=nren, nren_id=nren.id, year=year, project=project) + session.merge(ecproject_entry) + session.commit() + + +def db_organizations_migration(): + with db.session_scope() as session: + nren_dict = helpers.get_uppercase_nren_dict(session) + + organization_data = parse_excel_data.fetch_organization_excel_data() + for (abbrev, year, org) in organization_data: + if abbrev not in nren_dict: + logger.warning(f'{abbrev} unknown. Skipping.') + continue + + nren = nren_dict[abbrev] + org_entry = model.ParentOrganization(nren=nren, nren_id=nren.id, year=year, organization=org) + session.merge(org_entry) + session.commit() + + def _cli(config): helpers.init_db(config) db_budget_migration() db_funding_migration() db_charging_structure_migration() db_staffing_migration() + db_ecprojects_migration() + db_organizations_migration() @click.command() diff --git a/test/test_survey_publisher_v1.py b/test/test_survey_publisher_v1.py index b712812b..c7ebc2d9 100644 --- a/test/test_survey_publisher_v1.py +++ b/test/test_survey_publisher_v1.py @@ -68,3 +68,24 @@ def test_publisher(client, mocker, dummy_config): assert kifu_data[5].subcontracted_fte == 3 assert kifu_data[5].technical_fte == 133 assert kifu_data[5].non_technical_fte == 45 + + ecproject_data = session.query(model.ECProject).all() + # test a couple of random entries + surf2017 = [x for x in ecproject_data if x.nren.name == 'SURF' and x.year == 2017] + assert len(surf2017) == 1 + assert surf2017[0].project == 'Asterics and Magic' + + asnetam2018 = [x for x in ecproject_data if x.nren.name == 'ASNET-AM' and x.year == 2018] + assert len(asnetam2018) == 1 + assert asnetam2018[0].project == 'EaPConnect' + + kifu2019 = [x for x in ecproject_data if x.nren.name == 'KIFU' and x.year == 2019] + assert len(kifu2019) == 4 + assert kifu2019[3].project == 'SuperHeroes for Science' + + parent_data = session.query(model.ParentOrganization).all() + # test a random entry + asnet2021 = [x for x in parent_data if x.nren.name == 'ASNET-AM' and x.year == 2021] + assert len(asnet2021) == 1 + assert asnet2021[0].organization\ + == 'Institute for Informatics and Automation Problems of the National Academy of Sciences of Armenia' -- GitLab