diff --git a/Changelog.md b/Changelog.md index 27ceea9a4fc339855cb41d095f9079c5853dbb89..e0141ac64c5a33e817b19ef441fc073ee38393b3 100644 --- a/Changelog.md +++ b/Changelog.md @@ -2,10 +2,12 @@ All notable changes to this project will be documented in this file. + ## [0.78] - 2025-01-13 - COMP-371: Add S&P - EOSC Listings page (unlisted due to bad data) - Only render http(s) links as links in URL tables - Get rid of redundant collapsible-column css & fix up some css +- Add survey-publisher-legacy script for converting old response data into new surveys (reusing 2024 survey template) ## [0.77] - 2025-01-10 - COMP-369: Add Network - Capacity - External IP Connections page diff --git a/compendium_v2/publishers/legacy_publisher/survey_publisher_legacy.py b/compendium_v2/publishers/legacy_publisher/survey_publisher_legacy.py new file mode 100644 index 0000000000000000000000000000000000000000..305d81cef118488841bb768d41f75c0101f20c5f --- /dev/null +++ b/compendium_v2/publishers/legacy_publisher/survey_publisher_legacy.py @@ -0,0 +1,123 @@ +import click +from itertools import chain +from collections import defaultdict +from typing import Dict, Any + +import compendium_v2 +from compendium_v2.db import db +from compendium_v2.config import load +from compendium_v2.db.presentation_models import NREN +from compendium_v2.survey_db import model as survey_model +from compendium_v2.db.survey_models import SurveyResponse, Survey, SurveyStatus, ResponseStatus, SurveyNotes +from compendium_v2.publishers.legacy_publisher.survey_publisher_legacy_db import fetch_data as fetch_data_db +from compendium_v2.publishers.legacy_publisher.survey_publisher_legacy_excel import fetch_data as fetch_data_excel + + +def insert_survey_data(survey_2024: Survey, nren: NREN, year: int, answer: Dict[str, Any]): + # we're basing the generated survey on the 2024 survey, so we need to make sure that exists + # before we insert the responses. + survey = db.session.query(Survey).filter(Survey.year == year).first() + if not survey: + survey = Survey( + year=year, + survey=survey_2024.survey, + status=SurveyStatus.published + ) + db.session.add(survey) + response = db.session.query(SurveyResponse).filter(SurveyResponse.survey_year == + year, SurveyResponse.nren_id == nren.id).first() + + if not response: + # add some default values for the survey + answer['page'] = 1 + answer['verification_status'] = {} + + response = SurveyResponse( + survey=survey, + survey_year=year, + nren_id=nren.id, + nren=nren, + answers=answer, + status=ResponseStatus.completed + ) + + response_notes = SurveyNotes( + survey=response, + survey_year=year, + nren_id=nren.id, + notes="This survey has been imported by the legacy survey importer. Please review the data for accuracy.", + ) + + db.session.add(response) + db.session.add(response_notes) + db.session.commit() + + +def delete_surveys(): + db.session.query(SurveyNotes).filter(SurveyNotes.survey_year >= 2016, SurveyNotes.survey_year <= 2021).delete() + db.session.query(SurveyResponse).filter(SurveyResponse.survey_year >= + 2016, SurveyResponse.survey_year <= 2021).delete() + db.session.query(Survey).filter(Survey.year >= 2016, Survey.year <= 2021).delete() + db.session.commit() + + +def ensure_string_tree(tree: Dict[str, Any]): + # this function converts all object values to strings (from int, float, etc) + # if we encounter lists or dicts, we recurse into them. 
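+    # for example: {'budget': 12.5, 'items': [3, {'note': None}]} becomes {'budget': '12.5', 'items': ['3', {}]}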
+
+    for key, value in list(tree.items()):
+        if isinstance(value, dict):
+            ensure_string_tree(value)
+        elif isinstance(value, list):
+            for i, val in list(enumerate(value)):
+                if isinstance(val, dict):
+                    ensure_string_tree(val)
+                else:
+                    value[i] = str(val)
+        else:
+            if value is None:
+                del tree[key]
+            else:
+                tree[key] = str(value)
+    return tree
+
+
+@click.command()
+@click.option('--config', type=click.STRING, default='config.json')
+def cli(config):
+    app_config = load(open(config, 'r'))
+
+    app_config['SQLALCHEMY_BINDS'] = {survey_model.SURVEY_DB_BIND: app_config['SURVEY_DATABASE_URI']}
+
+    app = compendium_v2._create_app_with_db(app_config)
+    print("survey-publisher-legacy starting")
+    with app.app_context():
+        all_responses = db.session.query(SurveyResponse).filter(SurveyResponse.survey_year == 2024).all()
+
+        data = [resp.answers['data'] for resp in all_responses if 'data' in resp.answers]
+
+        valid_keys = set(chain(*[a.keys() for a in data]))
+
+        nren_map = defaultdict(lambda: defaultdict(lambda: {'data': {}}))
+        for data_key, nren, nren_id, year, value in chain(fetch_data_excel(), fetch_data_db()):
+            if data_key not in valid_keys:
+                print(f'Invalid data key: {data_key} for NREN: {nren} ({nren_id}) in year {year}')
+            nren_map[nren][year]['data'][data_key] = value
+
+        # use this to gauge quality of a survey year:
+        # answers = [len(d['data']) for yearmap in nren_map.values() for year, d in yearmap.items() if year == 2018]
+        # sum(answers) / len(answers)
+        delete_surveys()
+        survey_2024 = db.session.query(Survey).filter(Survey.year == 2024).first()
+        for nren, years in nren_map.items():
+            for year, data in years.items():
+                if year < 2016 or year > 2021:
+                    # data before 2016 is very sparse, so don't move it over
+                    # we already have 2022 and above, so don't port those.
+                    continue
+                data = ensure_string_tree(data)
+                insert_survey_data(survey_2024, nren, year, data)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/compendium_v2/publishers/legacy_publisher/survey_publisher_legacy_db.py b/compendium_v2/publishers/legacy_publisher/survey_publisher_legacy_db.py
new file mode 100644
index 0000000000000000000000000000000000000000..50a0b60b65ab321b26620973f51990efaa3688b9
--- /dev/null
+++ b/compendium_v2/publishers/legacy_publisher/survey_publisher_legacy_db.py
@@ -0,0 +1,1307 @@
+"""
+survey_publisher_legacy_db
+==========================
+
+This module loads the legacy survey data from the old survey database (2022 and, via equivalent questions, earlier
+years) and yields it as (data_key, nren, nren_id, year, value) tuples.
+ +""" +from decimal import Decimal +import logging +import enum +import json +import html +import itertools + +from sqlalchemy import text +from collections import defaultdict + +from compendium_v2.conversion.mapping import CHARGING_LEVELS, CONNECTION, INTERCONNECTION, SERVICE_USER_TYPE_TO_CODE +from compendium_v2.db.presentation_model_enums import CommercialCharges, CommercialConnectivityCoverage, \ + ConnectionMethod, FeeType, ServiceCategory, UserCategory, YesNoPlanned +from compendium_v2.environment import setup_logging +from compendium_v2.publishers.helpers import extract_urls, valid_url +from compendium_v2.survey_db import model as survey_model +from compendium_v2.db import db +from compendium_v2.publishers import helpers +from compendium_v2.conversion import mapping + +setup_logging() + +logger = logging.getLogger('survey-publisher-old-db-2022') + +BUDGET_QUERY = """ +SELECT DISTINCT ON (n.id, a.question_id) + n.abbreviation AS nren, + a.value AS budget +FROM answers a +JOIN nrens n ON a.nren_id = n.id +JOIN questions q ON a.question_id = q.id +JOIN sections s ON q.section_id = s.id +JOIN compendia c ON s.compendium_id = c.id +WHERE + a.question_id = 16402 +AND c.year = 2022 +ORDER BY n.id, a.question_id, a.updated_at DESC +""" + +QUESTION_TEMPLATE_QUERY = """ +SELECT DISTINCT ON (n.id, a.question_id) + n.abbreviation AS nren, + a.value AS value +FROM answers a +JOIN nrens n ON a.nren_id = n.id +JOIN questions q ON a.question_id = q.id +JOIN sections s ON q.section_id = s.id +JOIN compendia c ON s.compendium_id = c.id +WHERE + a.question_id = {} + AND c.year = {} + AND a.value NOT IN ('"NA"', '"N/A"', '[""]', '["-"]', '["/"]') +ORDER BY n.id, a.question_id, a.updated_at DESC +""" + +RECURSIVE_QUERY = """ + WITH RECURSIVE parent_questions AS ( + -- Base case + SELECT q.id, q.equivalent_question_id, c.year, q.title + FROM questions q + JOIN sections s ON q.section_id = s.id + JOIN compendia c ON s.compendium_id = c.id + WHERE q.id = {} + UNION ALL + -- Recursive case + SELECT q.id, q.equivalent_question_id, c.year, q.title + FROM questions q + INNER JOIN parent_questions pq ON q.id = pq.equivalent_question_id + JOIN sections s ON q.section_id = s.id + JOIN compendia c ON s.compendium_id = c.id) + SELECT DISTINCT ON (n.id, answers.question_id) answers.id, + UPPER(n.abbreviation) AS nren, + parent_questions.year, + answers.value as answer + FROM answers + JOIN parent_questions ON answers.question_id = parent_questions.id + JOIN nrens n on answers.nren_id = n.id + WHERE UPPER(answers.value) NOT IN ('"NA"', '"N/A"', '[""]', '["-"]', '["/"]', '/', '["NA"]', '""', '[]', '[n/a]') + ORDER BY n.id, answers.question_id, answers.updated_at DESC; +""" + + +class FundingSource(enum.Enum): + CLIENT_INSTITUTIONS = 16405 + EUROPEAN_FUNDING = 16406 + COMMERCIAL = 16407 + OTHER = 16408 + GOV_PUBLIC_BODIES = 16409 + + +class StaffQuestion(enum.Enum): + """ + Answers are numbers expressed in FTEs (full time equivalents) + """ + PERMANENT_FTE = 16414 + SUBCONTRACTED_FTE = 16413 + TECHNICAL_FTE = 16416 + NON_TECHNICAL_FTE = 16417 + + +class OrgQuestion(enum.Enum): + """ + Answers are strings + """ + PARENT_ORG_NAME = 16419 + + SUB_ORGS_1_NAME = 16422 + SUB_ORGS_1_CHOICE = 16449 + SUB_ORGS_1_ROLE = 16426 + + SUB_ORGS_2_NAME = 16429 + SUB_ORGS_2_CHOICE = 16448 + SUB_ORGS_2_ROLE = 16434 + + SUB_ORGS_3_NAME = 16430 + SUB_ORGS_3_CHOICE = 16446 + SUB_ORGS_3_ROLE = 16435 + + SUB_ORGS_4_NAME = 16432 + SUB_ORGS_4_CHOICE = 16451 + SUB_ORGS_4_ROLE = 16438 + + SUB_ORGS_5_NAME = 16433 + SUB_ORGS_5_CHOICE = 16450 + 
SUB_ORGS_5_ROLE = 16439 + + +class ECQuestion(enum.Enum): + EC_PROJECT = 16453 + + +class ChargingStructure(enum.Enum): + """ + Answers are strings + """ + charging_structure = 16410 + + +def query_budget(): + return db.session.execute(text(BUDGET_QUERY), bind_arguments={'bind': db.engines[survey_model.SURVEY_DB_BIND]}) + + +def recursive_query(question_id_2022): + assert question_id_2022 + query = RECURSIVE_QUERY.format(question_id_2022) + return db.session.execute(text(query), bind_arguments={'bind': db.engines[survey_model.SURVEY_DB_BIND]}) + + +def query_funding_sources(): + for source in FundingSource: + query = QUESTION_TEMPLATE_QUERY.format(source.value, 2022) + yield source, db.session.execute(text(query), bind_arguments={'bind': db.engines[survey_model.SURVEY_DB_BIND]}) + + +def query_question(question: enum.Enum): + return query_question_id(question.value) + + +def query_question_id(question_id: int, year: int = 2022): + query = QUESTION_TEMPLATE_QUERY.format(question_id, year) + return db.session.execute(text(query), bind_arguments={'bind': db.engines[survey_model.SURVEY_DB_BIND]}) + + +def budget(nren_dict): + rows = query_budget() + for row in rows: + nren_name = row[0].upper() + _budget = row[1] + try: + budget = float(_budget.replace('"', '').replace(',', '')) + except ValueError: + continue + + if nren_name not in nren_dict: + continue + + yield ('budget', nren_dict[nren_name], nren_dict[nren_name].id, 2022, budget) + + +def funding_sources(nren_dict): + sourcedata = {} + for source, data in query_funding_sources(): + for row in data: + nren_name = row[0].upper() + _value = row[1] + try: + value = float(_value.replace('"', '').replace(',', '')) + except ValueError: + value = 0 + + nren_info = sourcedata.setdefault( + nren_name, + {source_type: 0 for source_type in FundingSource} + ) + nren_info[source] = value + + for nren_name, nren_info in sourcedata.items(): + if nren_name not in nren_dict: + continue + + if nren_name == 'HEANET': + nren_info[FundingSource.OTHER] = nren_info[FundingSource.OTHER] + nren_info[FundingSource.COMMERCIAL] + nren_info[FundingSource.COMMERCIAL] = 0 + + data = { + 'client_institutions': nren_info[FundingSource.CLIENT_INSTITUTIONS], + 'european_funding': nren_info[FundingSource.EUROPEAN_FUNDING], + 'commercial': nren_info[FundingSource.COMMERCIAL], + 'other': nren_info[FundingSource.OTHER], + 'gov_public_bodies': nren_info[FundingSource.GOV_PUBLIC_BODIES], + } + + yield ('income_sources', nren_dict[nren_name], nren_dict[nren_name].id, 2022, data) + + +def charging_structure(nren_dict): + rows = query_question(ChargingStructure.charging_structure) + for row in rows: + nren_name = row[0].upper() + value = row[1].replace('"', '').strip() + + if nren_name not in nren_dict: + continue + + if "do not charge" in value: + charging_structure = FeeType.no_charge + elif "combination" in value: + charging_structure = FeeType.combination + elif "flat" in value: + charging_structure = FeeType.flat_fee + elif "usage-based" in value: + charging_structure = FeeType.usage_based_fee + elif "Other" in value: + charging_structure = FeeType.other + else: + charging_structure = None + + if charging_structure: + charging_structure = charging_structure.name + + yield ('charging_mechanism', nren_dict[nren_name], nren_dict[nren_name].id, 2022, charging_structure) + + +def staff_data(nren_dict): + data = {} + for question in StaffQuestion: + rows = query_question(question) + for row in rows: + nren_name = row[0].upper() + _value = row[1] + try: + value = 
float(_value.replace('"', '').replace(',', '')) + except ValueError: + value = 0 + + if nren_name not in nren_dict: + continue + + # initialize on first use, so we don't add data for nrens with no answers + data.setdefault(nren_name, {question: 0 for question in StaffQuestion})[question] = value + + for nren_name, nren_info in data.items(): + if sum([nren_info[question] for question in StaffQuestion]) == 0: + continue + + staff_type_data = { + 'permanent_fte': nren_info[StaffQuestion.PERMANENT_FTE], + 'subcontracted_fte': nren_info[StaffQuestion.SUBCONTRACTED_FTE], + } + + staff_roles_data = { + 'technical_fte': nren_info[StaffQuestion.TECHNICAL_FTE], + 'non_technical_fte': nren_info[StaffQuestion.NON_TECHNICAL_FTE], + } + + yield ('staff_employment_type', nren_dict[nren_name], nren_dict[nren_name].id, 2022, staff_type_data) + + yield ('staff_roles', nren_dict[nren_name], nren_dict[nren_name].id, 2022, staff_roles_data) + + +def nren_parent_org(nren_dict): + # clean up the data a bit by removing some strings + strings_to_replace = [ + 'We are affiliated to ' + ] + + rows = list(query_question(OrgQuestion.PARENT_ORG_NAME)) + + for row in rows: + nren_name = row[0].upper() + value = str(row[1]).replace('"', '') + + if not value: + continue + + for string in strings_to_replace: + value = value.replace(string, '') + + if nren_name not in nren_dict: + continue + + yield ('parent_organization', nren_dict[nren_name], nren_dict[nren_name].id, 2022, 'Yes') + yield ('parent_organization_name', nren_dict[nren_name], nren_dict[nren_name].id, 2022, value) + + +def nren_sub_org(nren_dict): + suborg_questions = [ + (OrgQuestion.SUB_ORGS_1_NAME, OrgQuestion.SUB_ORGS_1_CHOICE, OrgQuestion.SUB_ORGS_1_ROLE), + (OrgQuestion.SUB_ORGS_2_NAME, OrgQuestion.SUB_ORGS_2_CHOICE, OrgQuestion.SUB_ORGS_2_ROLE), + (OrgQuestion.SUB_ORGS_3_NAME, OrgQuestion.SUB_ORGS_3_CHOICE, OrgQuestion.SUB_ORGS_3_ROLE), + (OrgQuestion.SUB_ORGS_4_NAME, OrgQuestion.SUB_ORGS_4_CHOICE, OrgQuestion.SUB_ORGS_4_ROLE), + (OrgQuestion.SUB_ORGS_5_NAME, OrgQuestion.SUB_ORGS_5_CHOICE, OrgQuestion.SUB_ORGS_5_ROLE) + ] + lookup = defaultdict(list) + + for name, choice, role in suborg_questions: + _name_rows = query_question(name) + _choice_rows = query_question(choice) + _role_rows = list(query_question(role)) + for _name, _choice in zip(_name_rows, _choice_rows): + nren_name = _name[0].upper() + suborg_name = _name[1].replace('"', '').strip() + role_choice = _choice[1].replace('"', '').strip() + + if nren_name not in nren_dict: + continue + + if role_choice.lower() == 'other': + for _role in _role_rows: + if _role[0] == _name[0]: + role = _role[1].replace('"', '').strip() + break + else: + role = role_choice + + if not role: + continue + + lookup[nren_name].append((suborg_name, role)) + + for nren_name, suborgs in lookup.items(): + if suborgs: + yield ('suborganizations', nren_dict[nren_name], nren_dict[nren_name].id, 2022, 'Yes') + + result = [] + for suborg_name, role in suborgs: + result.append({ + 'suborganization_name': suborg_name, + 'suborganization_role': role + }) + + yield ('suborganization_details', nren_dict[nren_name], nren_dict[nren_name].id, 2022, result) + + +def ec_projects(nren_dict): + rows = query_question(ECQuestion.EC_PROJECT) + for row in rows: + nren_name = row[0].upper() + + if nren_name not in nren_dict: + continue + + try: + value = json.loads(row[1]) + except json.decoder.JSONDecodeError: + continue + + has_values = any(value) + + if has_values: + yield ('ec_projects', nren_dict[nren_name], nren_dict[nren_name].id, 2022, 
"Yes") + + result = [] + + for val in value: + if not val: + continue + + # strip html entities/NBSP from val + val = html.unescape(val).replace('\xa0', ' ') + + # some answers include contract numbers, which we don't want here + val = val.split('(contract n')[0] + result.append(str(val).strip()) + + yield ('ec_project_names', nren_dict[nren_name], nren_dict[nren_name].id, 2022, result) + + +def policies(nren_dict): + """ + Answers are strings that should be urls, but sometimes there's other stuff + like email addresses or random text + """ + + policy_questions = { + 'strategy': {2022: 16469, 2021: 16064, 2020: 15720, 2019: 15305, 2018: 14910}, + 'environment': {2022: 16471, 2021: 16066, 2020: 15722, 2019: 15307, 2018: 14912}, + 'equality': {2022: 16473, 2021: 16378}, + 'connectivity': {2022: 16475, 2021: 16068, 2020: 15724, 2019: 15309, 2018: 14914}, + 'acceptable_use': {2022: 16477, 2021: 16070, 2020: 15726, 2019: 15311, 2018: 14916}, + 'privacy': {2022: 16479, 2021: 16072, 2020: 15728, 2019: 15575}, + 'data_protection': {2022: 16481, 2021: 16074, 2020: 15730, 2019: 15577}, + 'gender': {2022: 16761} + } + + data = {} + for year in [2018, 2019, 2020, 2021, 2022]: + policy_questions_year = {key: years[year] for key, years in policy_questions.items() if year in years} + for question_key, question_id in policy_questions_year.items(): + rows = query_question_id(question_id, year) + for row in rows: + nren_name = row[0].upper() + _value = row[1] + + if nren_name not in nren_dict: + continue + + value = _value.split()[0].strip('"') + + if not value: + continue + + if value.upper() == 'N.A.' or ('.' not in value and '@' not in value): + # this test is a bit silly but does seem to filter out all the nonsense responses + continue + + if _value not in [f'"{value}"', value]: + pass + + # initialize on first use, so we don't add data for nrens with no answers + data.setdefault((nren_name, year), {q: '' for q in policy_questions.keys()}) + data[(nren_name, year)][question_key] = value + + for (nren_name, year), nren_info in data.items(): + + strategy = nren_info['strategy'] + if strategy and not valid_url(strategy): + strategy = '' + environment = nren_info['environment'] + if environment and not valid_url(environment): + environment = '' + equality = nren_info['equality'] + if equality and not valid_url(equality): + equality = '' + connectivity = nren_info['connectivity'] + if connectivity and not valid_url(connectivity): + connectivity = '' + acceptable_use = nren_info['acceptable_use'] + if acceptable_use and not valid_url(acceptable_use): + acceptable_use = '' + privacy = nren_info['privacy'] + if privacy and not valid_url(privacy): + privacy = '' + data_protection = nren_info['data_protection'] + if data_protection and not valid_url(data_protection): + data_protection = '' + gender_equality = nren_info['gender'] + if gender_equality and not valid_url(gender_equality): + gender_equality = '' + + all_policies = [strategy, environment, equality, connectivity, + acceptable_use, privacy, data_protection, gender_equality] + if not any(all_policies): + continue + + if strategy: + yield ('corporate_strategy', nren_dict[nren_name], nren_dict[nren_name].id, year, 'Yes') + yield ('corporate_strategy_url', nren_dict[nren_name], nren_dict[nren_name].id, year, strategy) + + policies = {} + if environment: + policies['environmental_policy'] = { + 'available': ['yes'], + 'url': environment + } + if equality: + policies['equal_opportunity_policy'] = { + 'available': ['yes'], + 'url': equality + } + if 
connectivity: + policies['connectivity_policy'] = { + 'available': ['yes'], + 'url': connectivity + } + if acceptable_use: + policies['acceptable_use_policy'] = { + 'available': ['yes'], + 'url': acceptable_use + } + if privacy: + policies['privacy_notice'] = { + 'available': ['yes'], + 'url': privacy + } + if data_protection: + policies['data_protection_contact'] = { + 'available': ['yes'], + 'url': data_protection + } + if gender_equality: + policies['gender_equality_policy'] = { + 'available': ['yes'], + 'url': gender_equality + } + + yield ('policies', nren_dict[nren_name], nren_dict[nren_name].id, year, policies) + + +def central_procurement(nren_dict): + + rows = recursive_query(16482) + amounts = recursive_query(16483) + amounts = {(nren_name, year): float(answer.strip('"')) for answer_id, nren_name, year, answer in amounts} + + for answer_id, nren_name, year, answer in rows: + if nren_name not in nren_dict: + continue + + yield ('central_software_procurement', nren_dict[nren_name], nren_dict[nren_name].id, year, answer) + + amount = amounts.get((nren_name, year)) + if answer == '"Yes"': + yield ('central_procurement_amount', nren_dict[nren_name], nren_dict[nren_name].id, year, amount) + + +def service_management(nren_dict): + + framework = recursive_query(16484) + framework = {(nren_name, year): answer == '"Yes"' for answer_id, nren_name, year, answer in framework} + targets = recursive_query(16485) + targets = {(nren_name, year): answer == '"Yes"' for answer_id, nren_name, year, answer in targets} + + for nren_name, year in framework.keys() | targets.keys(): + if nren_name not in nren_dict: + continue + + nren = nren_dict[nren_name] + framework_val = framework.get((nren_name, year)) + target_value = targets.get((nren_name, year)) + yield ('formal_service_management_framework', nren, nren.id, year, 'Yes' if framework_val else 'No') + yield ('service_level_targets', nren, nren.id, year, 'Yes' if target_value else 'No') + + +def service_user_types(nren_dict): + + categories = [ + (ServiceCategory.identity, 16488), + (ServiceCategory.network_services, 16489), + (ServiceCategory.collaboration, 16490), + (ServiceCategory.security, 16491), + (ServiceCategory.isp_support, 16492), + (ServiceCategory.storage_and_hosting, 16493), + (ServiceCategory.multimedia, 16494), + (ServiceCategory.professional_services, 16495) + ] + for service_category, question_id in categories: + rows = recursive_query(question_id) + for answer_id, nren_name, year, answer in rows: + if nren_name not in nren_dict: + continue + + service_matrix = defaultdict(dict) + + for user_cat_db in json.loads(answer): + user_cat = UserCategory[SERVICE_USER_TYPE_TO_CODE[user_cat_db]] + service_matrix[user_cat.name].setdefault('service_types', []) + + service_matrix[user_cat.name]['service_types'].append(service_category.name) + + yield ('service_matrix', nren_dict[nren_name], nren_dict[nren_name].id, year, service_matrix) + + +def standards(nren_dict): + + audits = recursive_query(16499) + audits = {(nren_name, year): answer == '"Yes"' for answer_id, nren_name, year, answer in audits} + audit_specifics = recursive_query(16500) + audit_specifics = {(nren_name, year): answer.strip('"') for answer_id, nren_name, year, answer in audit_specifics} + bcp = recursive_query(16501) + bcp = {(nren_name, year): answer == '"Yes"' for answer_id, nren_name, year, answer in bcp} + bcp_specifics = recursive_query(16502) + bcp_specifics = {(nren_name, year): answer.strip('"') for answer_id, nren_name, year, answer in bcp_specifics} + cmp = 
recursive_query(16762) + cmp = {(nren_name, year): answer == '"Yes"' for answer_id, nren_name, year, answer in cmp} + + for nren_name, year in audits.keys() | audit_specifics.keys() | bcp.keys() | bcp_specifics.keys() | cmp.keys(): + if nren_name not in nren_dict: + continue + + nren = nren_dict[nren_name] + + audit = audits.get((nren_name, year)) + yield ('audits', nren, nren.id, year, 'Yes' if audit else 'No') + + a_specifics = audit_specifics.get((nren_name, year)) + if a_specifics: + yield ('audit_specifics', nren, nren.id, year, a_specifics) + + bcp_val = bcp.get((nren_name, year)) + yield ('business_continuity_plans', nren, nren.id, year, 'Yes' if bcp_val else 'No') + + bcp_s_val = bcp_specifics.get((nren_name, year)) + if bcp_s_val and bcp_val: + yield ('business_continuity_plans_specifics', nren, nren.id, year, bcp_s_val) + + cmp_val = cmp.get((nren_name, year)) + yield ('crisis_management_procedure', nren, nren.id, year, 'Yes' if cmp_val else 'No') + + +def crisis_exercises(nren_dict): + + rows = recursive_query(16763) + + crisis_exercises_map = { + "geant_workshops": "We participate in GEANT Crisis workshops such as CLAW", + "national_excercises": "We participated in National crisis exercises ", + "tabletop_exercises": "We run our own tabletop exercises", + "simulation_excercises": "We run our own simulation exercises", + "other_excercises": "We have done/participated in other exercises or trainings", + "real_crisis": "We had a real crisis", + "internal_security_programme": "We run an internal security awareness programme", + "none": "No, we have not done any crisis exercises or trainings", + } + _reversed_map = {v: k for k, v in crisis_exercises_map.items()} + + for answer_id, nren_name, year, answer in rows: + if nren_name not in nren_dict: + continue + + descriptions = list(filter(lambda d: bool(d), [_reversed_map.get(desc) for desc in json.loads(answer)])) + + yield ('crisis_exercises', nren_dict[nren_name], nren_dict[nren_name].id, year, descriptions) + + +def security_controls(nren_dict): + + controls_map = { + "anti_virus": "Anti Virus", + "anti_spam": "Anti-Spam", + "firewall": "Firewall", + "ddos_mitigation": "DDoS mitigation", + "monitoring": "Network monitoring", + "ips_ids": "IPS/IDS", + "acl": "ACL", + "segmentation": "Network segmentation", + "integrity_checking": "Integrity checking" + } + + reversed_map = {v: k for k, v in controls_map.items()} + + sc = recursive_query(16503) + sc = {(nren_name, year): json.loads(answer) for answer_id, nren_name, year, answer in sc} + sc_other = recursive_query(16504) + sc_other = {(nren_name, year): json.loads(answer) for answer_id, nren_name, year, answer in sc_other} + for key, value in sc_other.items(): + if not isinstance(value, list): + sc_other[key] = [value] + + for nren_name, year in sc.keys() | sc_other.keys(): + if year < 2021: # prior to 2022, the mapping is different, use a different data source + continue + + # TODO: import the pre-2022 data from a handmade CSV. 
+ + if nren_name not in nren_dict: + continue + + full_list = sc.get((nren_name, year), []) + other_entries = [e.strip() for e in sc_other.get((nren_name, year), []) + if e.strip() and e.lower() not in ["n/a", "-"]] + other_entry = ", ".join(other_entries) + if other_entry: + full_list.append(other_entry) + if "Other" in full_list: + full_list.remove("Other") + + full_list = list(filter(lambda d: bool(d), [reversed_map.get(control) for control in full_list])) + + yield ('security_controls', nren_dict[nren_name], nren_dict[nren_name].id, year, full_list) + + +def institutions_urls(nren_dict): + + rows = recursive_query(16507) + + for row in rows: + answer_id, nren_name, year, answer = row + if nren_name not in nren_dict: + continue + + urls = extract_urls(text=answer) + + if not urls: + continue + + valid_urls = [] + + for url in urls: + if not valid_url(url): + continue + valid_urls.append(url) + + if not valid_urls: + continue + + connected_sites = [] + for url in valid_urls: + connected_sites.append({'connected_sites_url': url}) + + yield ('connected_sites_lists', nren_dict[nren_name], nren_dict[nren_name].id, year, connected_sites) + + +def commercial_connectivity(nren_dict): + simple_connection = { + key.replace(" ", "").replace("-", "").replace("/", "").lower(): value for key, value in CONNECTION.items() + } + + def get_coverage(db_string): + cleaned_str = db_string.strip('"').replace(" ", "").replace("-", "").replace("/", "").lower() + key = simple_connection[cleaned_str] + return CommercialConnectivityCoverage[key] + + sp = recursive_query(16646) + sp = {(nren_name, year): get_coverage(answer) for answer_id, nren_name, year, answer in sp} + collab = recursive_query(16647) + collab = {(nren_name, year): get_coverage(answer) for answer_id, nren_name, year, answer in collab} + r_e = recursive_query(16648) + r_e = {(nren_name, year): get_coverage(answer) for answer_id, nren_name, year, answer in r_e} + general = recursive_query(16649) + general = {(nren_name, year): get_coverage(answer) for answer_id, nren_name, year, answer in general} + spin_off = recursive_query(16650) + spin_off = {(nren_name, year): get_coverage(answer) for answer_id, nren_name, year, answer in spin_off} + + for nren_name, year in sp.keys() | collab.keys() | r_e.keys() | general.keys() | spin_off.keys(): + if nren_name not in nren_dict: + continue + + commercial_orgs = {} + + commercial_r_e = r_e.get((nren_name, year)) + commercial_general = general.get((nren_name, year)) + commercial_collaboration = collab.get((nren_name, year)) + commercial_service_provider = sp.get((nren_name, year)) + university_spin_off = spin_off.get((nren_name, year)) + + if commercial_r_e: + commercial_orgs['commercial_r_e'] = { + 'connection': commercial_r_e.name + } + if commercial_general: + commercial_orgs['commercial_general'] = { + 'connection': commercial_general.name + } + if commercial_collaboration: + commercial_orgs['commercial_collaboration'] = { + 'connection': commercial_collaboration.name + } + if commercial_service_provider: + commercial_orgs['commercial_service_provider'] = { + 'connection': commercial_service_provider.name + } + if university_spin_off: + commercial_orgs['university_spin_off'] = { + 'connection': university_spin_off.name + } + + yield ('commercial_organizations', nren_dict[nren_name], nren_dict[nren_name].id, year, commercial_orgs) + + +def commercial_charging_level(nren_dict): + + simple_charging = { + key.replace(" ", "").replace("-", "").replace("/", "").lower(): value for key, value in 
CHARGING_LEVELS.items() + } + simple_charging["nochargesapplied"] = "no_charges_if_r_e_requested" + simple_charging['nochargesappliedifrequestedbyr+eusers\\"needed?'] = "no_charges_if_r_e_requested" + + def get_charging(db_string): + if db_string[0] == '[': + db_string = json.loads(db_string)[0] + cleaned_str = db_string.strip('"').replace(" ", "").replace("-", "").replace("/", "").lower() + key = simple_charging[cleaned_str] + return CommercialCharges[key] + + collab = recursive_query(16652) + collab = {(nren_name, year): get_charging(answer) for answer_id, nren_name, year, answer in collab} + services = recursive_query(16653) + services = {(nren_name, year): get_charging(answer) for answer_id, nren_name, year, answer in services} + peering = recursive_query(16654) + peering = {(nren_name, year): get_charging(answer) for answer_id, nren_name, year, answer in peering} + + for nren_name, year in collab.keys() | services.keys() | peering.keys(): + if nren_name not in nren_dict: + continue + + nren = nren_dict[nren_name] + + commercial_charging_levels = {} + + collaboration = collab.get((nren_name, year)) + peer = peering.get((nren_name, year)) + service = services.get((nren_name, year)) + + if collaboration: + commercial_charging_levels['collaboration'] = { + 'charging_level': collaboration.name + } + if peer: + commercial_charging_levels['peering'] = { + 'charging_level': peer.name + } + if service: + commercial_charging_levels['services'] = { + 'charging_level': service.name + } + + yield ('commercial_charging_levels', nren, nren.id, year, commercial_charging_levels) + + +def fibre_light(nren_dict): + comment_map = mapping.VALUE_TO_CODE_MAPPING.get(16668) + + fibre = recursive_query(16668) + fibre = {(nren_name, year): answer.strip('"') for answer_id, nren_name, year, answer in fibre} + fibre_comment = recursive_query(16669) + fibre_comment = {(nren_name, year): answer.strip('"') for answer_id, nren_name, year, answer in fibre_comment} + + for nren_name, year in fibre.keys() | fibre_comment.keys(): + if nren_name not in nren_dict: + continue + + description = fibre.get((nren_name, year)) + comment = fibre_comment.get((nren_name, year)) + if description and description[0:5] != "Other": + if comment and comment.replace("-", "") != "": + pass # previously used to log comments, just skip them for now + else: + description = comment + + if description: + is_other = description not in comment_map + description = comment_map.get(description, description).replace("\\", "") + if is_other: + yield ('fibre_light', nren_dict[nren_name], nren_dict[nren_name].id, year, "other") + yield ('fibre_light-Comment', nren_dict[nren_name], nren_dict[nren_name].id, year, description) + else: + yield ('fibre_light', nren_dict[nren_name], nren_dict[nren_name].id, year, description) + + +def network_map_urls(nren_dict): + + rows = recursive_query(16670) + + for answer_id, nren_name, year, answer in rows: + if nren_name not in nren_dict: + continue + + urls = extract_urls(text=answer) + if not urls: + continue + + network_map_urls = [] + + for url in urls: + network_map_urls.append({'network_map_url': url}) + + yield ('network_map_urls', nren_dict[nren_name], nren_dict[nren_name].id, year, network_map_urls) + + +def monitoring_tools(nren_dict): + description_map = mapping.VALUE_TO_CODE_MAPPING.get(16672) + + tools = recursive_query(16672) + tools = {(nren_name, year): json.loads(answer) for answer_id, nren_name, year, answer in tools} + tools_comment = recursive_query(16673) + tools_comment = {(nren_name, year): 
answer.strip('" ') for answer_id, nren_name, year, answer in tools_comment} + netflow = recursive_query(16674) + netflow = {(nren_name, year): answer.strip('" ') for answer_id, nren_name, year, answer in netflow} + + for nren_name, year in tools.keys() | tools_comment.keys() | netflow.keys(): + if nren_name not in nren_dict: + continue + + nren = nren_dict[nren_name] + + tool_descriptions = tools.get((nren_name, year), []) + comment = tools_comment.get((nren_name, year), "").replace("-", "") + if comment: + tool_descriptions.append(comment) + if "Other" in tool_descriptions: + tool_descriptions.remove("Other") + if "Other " in tool_descriptions: + tool_descriptions.remove("Other ") + + monitoring_tools = [] + monitoring_tools_comment = "" + other_tools = [] + + for description in tool_descriptions: + if description in description_map: + monitoring_tools.append(description_map[description]) + else: + other_tools.append(description) + + if other_tools: + monitoring_tools.append("other") + monitoring_tools_comment = ", ".join(other_tools) + + yield ('monitoring_tools', nren, nren.id, year, monitoring_tools) + + if monitoring_tools_comment: + yield ('monitoring_tools-Comment', nren, nren.id, year, monitoring_tools_comment) + + # netflow processing description + if netflow.get((nren_name, year)): + yield ('netflow_vendors', nren, nren.id, year, netflow.get((nren_name, year))) + + +def traffic_statistics(nren_dict): + + stats = recursive_query(16677) + stat_urls = recursive_query(16678) + stat_urls = {(nren_name, year): answer for answer_id, nren_name, year, answer in stat_urls} + + for answer_id, nren_name, year, answer in stats: + if nren_name not in nren_dict: + continue + + nren = nren_dict[nren_name] + + db_urls = stat_urls.get((nren_name, year)) + if db_urls: + urls = extract_urls(text=db_urls) + db_urls = urls + else: + db_urls = [] + + valid_urls = [] + for url in db_urls: + if valid_url(url): + valid_urls.append(url) + + if not valid_urls: + continue + + yield ('traffic_statistics', nren, nren.id, year, "Yes") + yield ('traffic_statistics_urls', nren, nren.id, year, [{"traffic_statistics_url": url} for url in valid_urls]) + + +def siem_vendors(nren_dict): + + vendors = recursive_query(16679) + vendors = {(nren_name, year): json.loads(answer) for answer_id, nren_name, year, answer in vendors} + vendor_comment = recursive_query(16680) + vendor_comment = {(nren_name, year): answer.strip('"') for answer_id, nren_name, year, answer in vendor_comment} + + for nren_name, year in vendors.keys() | vendor_comment.keys(): + if nren_name not in nren_dict: + continue + + nren = nren_dict[nren_name] + + vendor_names = vendors.get((nren_name, year), []) + comment = vendor_comment.get((nren_name, year)) + if comment: + vendor_names.remove("Other") + + siem_soc_vendor = [*vendor_names] + siem_soc_vendor_comment = "" + if comment: + siem_soc_vendor.append("other") + siem_soc_vendor_comment = comment + + yield ('siem_soc_vendor', nren, nren.id, year, siem_soc_vendor) + if siem_soc_vendor_comment: + yield ('siem_soc_vendor-Comment', nren, nren.id, year, siem_soc_vendor_comment) + + +def certificate_providers(nren_dict): + + provider_map = mapping.VALUE_TO_CODE_MAPPING.get(16681) + + providers = recursive_query(16681) + providers = {(nren_name, year): json.loads(answer) for answer_id, nren_name, year, answer in providers} + prov_comment = recursive_query(16682) + prov_comment = {(nren_name, year): answer.strip('"') for answer_id, nren_name, year, answer in prov_comment} + + for nren_name, year in 
providers.keys() | prov_comment.keys(): + if nren_name not in nren_dict: + continue + + nren = nren_dict[nren_name] + + provider_names = providers.get((nren_name, year), []) + comment = prov_comment.get((nren_name, year)) + if comment: + provider_names.append(comment) + if "Other" in provider_names: + provider_names.remove("Other") + + def _replace_provider(provider): + if 'let' in provider.lower() and 'encrypt' in provider.lower(): + return "Let's Encrypt" + return provider_map.get(provider, provider) + + provider_names = [_replace_provider(p) for p in provider_names] + + certificate_service = [] + certificate_service_comment = [] + + for provider in provider_names: + if provider in provider_map: + certificate_service.append(provider) + else: + if "other" not in certificate_service: + certificate_service.append("other") + certificate_service_comment.append(provider) + + yield ('certificate_service', nren, nren.id, year, certificate_service) + if certificate_service_comment: + yield ('certificate_service-Comment', nren, nren.id, year, ", ".join(certificate_service_comment)) + + +def weather_map(nren_dict): + + weather = recursive_query(16683) + urls = recursive_query(16684) + urls = {(nren_name, year): answer.strip('" ') for answer_id, nren_name, year, answer in urls} + + for answer_id, nren_name, year, answer in weather: + if nren_name not in nren_dict: + continue + + url = urls.get((nren_name, year), "") + if url: + found_urls = extract_urls(text=url) + if found_urls: + url = found_urls[0] + else: + url = "" + + valid = valid_url(url) + + if not valid: + continue + + yield ('network_weather', nren_dict[nren_name], nren_dict[nren_name].id, year, "Yes") + yield ('network_weather_url', nren_dict[nren_name], nren_dict[nren_name].id, year, url) + + +def pert_team(nren_dict): + + rows = recursive_query(16685) + + for answer_id, nren_name, year, answer in rows: + if nren_name not in nren_dict: + continue + + if answer == "null": + continue + pert = YesNoPlanned[answer.strip('"').lower()] + + pert = pert.name[0].upper() + pert.name[1:] + + yield ('pert_team', nren_dict[nren_name], nren_dict[nren_name].id, year, pert) + + +def alien_wave(nren_dict): + + alien = recursive_query(16687) + alien = { + (nren_name, year): YesNoPlanned[answer.strip('"').lower()] for answer_id, nren_name, year, answer in alien + } + nr = recursive_query(16688) + nr = {(nren_name, year): int(answer.strip('"')) for answer_id, nren_name, year, answer in nr} + internal = recursive_query(16689) + internal = {(nren_name, year): answer == '"Yes"' for answer_id, nren_name, year, answer in internal} + + for nren_name, year in alien.keys() | nr.keys() | internal.keys(): + if nren_name not in nren_dict: + continue + + nren = nren_dict[nren_name] + + alien_wave_third_party = alien.get((nren_name, year)) + nr_of_alien_wave_third_party_services = nr.get((nren_name, year)) + alien_wave_internal = internal.get((nren_name, year)) + + if alien_wave_third_party or nr_of_alien_wave_third_party_services: + name = (alien_wave_third_party or YesNoPlanned.yes).name # default to yes if there's services, but no flag + name = name[0].upper() + name[1:] + yield ('alienwave_services', nren, nren.id, year, name) + + if nr_of_alien_wave_third_party_services: + yield ('alienwave_services_number', nren, nren.id, year, nr_of_alien_wave_third_party_services) + + if alien_wave_internal: + yield ('alienwave_internal', nren, nren.id, year, "Yes") + + +def external_connections(nren_dict): + + question_nrs = { + 16694: (5, "capacity"), + 16695: (7, 
"capacity"), + 16696: (6, "capacity"), + 16697: (7, "from_organization"), + 16698: (1, "to_organization"), + 16699: (8, "to_organization"), + 16700: (9, "to_organization"), + 16701: (1, "from_organization"), + 16702: (8, "capacity"), + 16703: (5, "to_organization"), + 16704: (0, "link_name"), + 16705: (1, "link_name"), + 16706: (9, "capacity"), + 16707: (2, "link_name"), + 16708: (0, "from_organization"), + 16709: (4, "link_name"), + 16710: (3, "link_name"), + 16711: (9, "link_name"), + 16712: (7, "link_name"), + 16713: (8, "link_name"), + 16714: (6, "link_name"), + 16715: (5, "link_name"), + 16716: (4, "from_organization"), + 16717: (5, "from_organization"), + 16718: (6, "from_organization"), + 16719: (2, "to_organization"), + 16720: (3, "to_organization"), + 16721: (4, "to_organization"), + 16722: (6, "to_organization"), + 16723: (7, "to_organization"), + 16724: (2, "interconnection_method"), + 16725: (3, "interconnection_method"), + 16726: (4, "interconnection_method"), + 16727: (5, "interconnection_method"), + 16728: (8, "from_organization"), + 16729: (9, "from_organization"), + 16730: (0, "to_organization"), + 16731: (0, "capacity"), + 16732: (1, "capacity"), + 16733: (2, "capacity"), + 16734: (3, "capacity"), + 16735: (4, "capacity"), + 16736: (3, "from_organization"), + 16737: (2, "from_organization"), + 16738: (1, "interconnection_method"), + 16739: (7, "interconnection_method"), + 16740: (8, "interconnection_method"), + 16741: (0, "interconnection_method"), + 16742: (9, "interconnection_method"), + 16743: (6, "interconnection_method") + } + + connection_dicts = {} + nren_year_set = set() + for question_id, (connection_nr, field) in question_nrs.items(): + rows = recursive_query(question_id) + for answer_id, nren_name, year, answer in rows: + nren_year_set.add((nren_name, year)) + conn_dict = connection_dicts.setdefault((nren_name, year, connection_nr), {}) + conn_dict[field] = answer.strip('" ') + + int_simple = {key.replace(" ", "").lower(): value for key, value in INTERCONNECTION.items()} + int_simple['openexchangepoi'] = "open_exchange" + + for conn_dict in connection_dicts.values(): + if conn_dict.get('capacity'): + try: + conn_dict['capacity'] = str(Decimal(conn_dict['capacity'].split('G')[0].strip())) + except: # noqa: E722 + # Capacity could not be converted to a number + conn_dict['capacity'] = None + if conn_dict.get('interconnection_method'): + int_conn = int_simple[conn_dict['interconnection_method'].replace(" ", "").lower()] + conn_dict['interconnection_method'] = ConnectionMethod[int_conn].name + + for nren_name, year in nren_year_set: + if nren_name not in nren_dict: + continue + + connections = [] + for connection_nr in range(0, 10): + conn = connection_dicts.get((nren_name, year, connection_nr)) + if conn: + connections.append(conn) + + yield ('external_connections', nren_dict[nren_name], nren_dict[nren_name].id, year, connections) + + +def network_automation(nren_dict): + + network_automation_map = mapping.VALUE_TO_CODE_MAPPING.get(16758) + + rows = recursive_query(16757) + tasks = recursive_query(16758) + tasks = {(nren_name, year): json.loads(answer) for answer_id, nren_name, year, answer in tasks} + + for answer_id, nren_name, year, answer in rows: + if nren_name not in nren_dict: + continue + + nren = nren_dict[nren_name] + + network_automation = YesNoPlanned[answer.strip('"').lower()] + specifics = tasks.get((nren_name, year), []) + + network_automation_tasks = [] + + for task in specifics: + if task in network_automation_map: + # we don't allow "other" in 
the tasks in the survey, so only map the known tasks + network_automation_tasks.append(network_automation_map[task]) + name = network_automation.name + name = name[0].upper() + name[1:] + yield ('network_automation', nren, nren.id, year, name) + + yield ('network_automation_tasks', nren, nren.id, year, network_automation_tasks) + + +def network_function_virtualisation(nren_dict): + + nfv_map = mapping.VALUE_TO_CODE_MAPPING.get(16755) + rows = recursive_query(16754) + types = recursive_query(16755) + types = {(nren_name, year): json.loads(answer) for answer_id, nren_name, year, answer in types} + types_comment = recursive_query(16756) + types_comment = {(nren_name, year): answer.strip('" ') for answer_id, nren_name, year, answer in types_comment} + + for answer_id, nren_name, year, answer in rows: + if nren_name not in nren_dict: + continue + + nren = nren_dict[nren_name] + + nfv = YesNoPlanned[answer.strip('"').lower()] + nfv = nfv.name[0].upper() + nfv.name[1:] + specifics = types.get((nren_name, year), []) + specifics = list(itertools.chain(*[s.split(', ') for s in specifics if s])) + comment = types_comment.get((nren_name, year), "").replace("-", "") + if comment: + specifics.append(comment) + if "Other" in specifics: + specifics.remove("Other") + + nfv_types = [] + nfv_types_comment = [] + for task in specifics: + if task in nfv_map: + nfv_types.append(nfv_map.get(task)) + else: + nfv_types_comment.append(task) + if "other" not in nfv_types: + nfv_types.append("other") + + yield ('nfv', nren, nren.id, year, nfv) + + if nfv_types: + yield ('nfv_types', nren, nren.id, year, nfv_types) + + if nfv_types_comment: + yield ('nfv_types-Comment', nren, nren.id, year, ", ".join(nfv_types_comment)) + + +def fetch_data(): + # requires being in a flask app context when called + nren_dict = helpers.get_uppercase_nren_dict() + yield from budget(nren_dict) + yield from funding_sources(nren_dict) + yield from staff_data(nren_dict) + yield from nren_parent_org(nren_dict) + yield from nren_sub_org(nren_dict) + yield from charging_structure(nren_dict) + yield from ec_projects(nren_dict) + yield from policies(nren_dict) + yield from institutions_urls(nren_dict) + + yield from central_procurement(nren_dict) + yield from service_management(nren_dict) + yield from service_user_types(nren_dict) + yield from standards(nren_dict) + yield from crisis_exercises(nren_dict) + yield from security_controls(nren_dict) + yield from commercial_connectivity(nren_dict) + yield from commercial_charging_level(nren_dict) + + yield from fibre_light(nren_dict) + yield from network_map_urls(nren_dict) + yield from monitoring_tools(nren_dict) + yield from traffic_statistics(nren_dict) + yield from siem_vendors(nren_dict) + yield from certificate_providers(nren_dict) + yield from weather_map(nren_dict) + yield from pert_team(nren_dict) + yield from alien_wave(nren_dict) + yield from external_connections(nren_dict) + yield from network_function_virtualisation(nren_dict) + yield from network_automation(nren_dict) diff --git a/compendium_v2/publishers/legacy_publisher/survey_publisher_legacy_excel.py b/compendium_v2/publishers/legacy_publisher/survey_publisher_legacy_excel.py new file mode 100644 index 0000000000000000000000000000000000000000..62cf976ab7cfee6ad6b2e308557a4ac4829b19d4 --- /dev/null +++ b/compendium_v2/publishers/legacy_publisher/survey_publisher_legacy_excel.py @@ -0,0 +1,525 @@ +""" +survey_publisher_v1 +========================= + +This module loads the survey data from before 2022 from a legacy Excel files. 
+Missing info is filled in from the survey db for some questions. +Registered as click cli command when installing compendium-v2. + +""" +from __future__ import annotations +import itertools +from sqlalchemy import select +from collections import defaultdict + +from compendium_v2.db import db +from compendium_v2.publishers import helpers, excel_parser +from compendium_v2.survey_db import model as survey_model + + +def budget(nren_dict): + nren_by_id = {nren.id: nren for nren in nren_dict.values()} + data = db.session.scalars(select(survey_model.Nrens)) + inserts = defaultdict(dict) + for nren in data: + for budget in nren.budgets: + abbrev = nren.abbreviation.upper() + year = budget.year + + if abbrev not in nren_dict: + continue + + budget_entry = { + 'nren': nren_dict[abbrev], + 'nren_id': nren_dict[abbrev].id, + 'budget': float(budget.budget), + 'year': year + } + inserts[nren_dict[abbrev].id][year] = budget_entry + + # Import the data from excel sheet to database + exceldata = excel_parser.fetch_budget_excel_data() + + for abbrev, budget, year in exceldata: + if abbrev not in nren_dict: + continue + + budget_entry = { + 'nren': nren_dict[abbrev], + 'nren_id': nren_dict[abbrev].id, + 'budget': budget, + 'year': year + } + inserts[nren_dict[abbrev].id][year] = budget_entry + + for nren_id, year_data in inserts.items(): + for year, budget_entry in year_data.items(): + nren = nren_by_id[nren_id] + yield ('budget', nren, nren.id, year, budget_entry['budget']) + + +def funding(nren_dict): + data = excel_parser.fetch_funding_excel_data() + for (abbrev, year, client_institution, + european_funding, + gov_public_bodies, + commercial, other) in data: + + if abbrev not in nren_dict: + continue + + _data = { + 'client_institutions': client_institution, + 'european_funding': european_funding, + 'commercial': commercial, + 'other': other, + 'gov_public_bodies': gov_public_bodies, + } + + yield ('income_sources', nren_dict[abbrev], nren_dict[abbrev].id, year, _data) + + +def charging_structure(nren_dict): + data = excel_parser.fetch_charging_structure_excel_data() + + for (abbrev, year, charging_structure) in data: + if abbrev not in nren_dict: + continue + + if charging_structure: + charging_structure = charging_structure.name + + yield ('charging_mechanism', nren_dict[abbrev], nren_dict[abbrev].id, year, charging_structure) + + +def staffing(nren_dict): + staff_data = list(excel_parser.fetch_staffing_excel_data()) + + nren_staff_map = {} + for (abbrev, year, permanent_fte, subcontracted_fte) in staff_data: + if abbrev not in nren_dict: + continue + + nren = nren_dict[abbrev] + nren_staff_map[(nren.id, year)] = { + 'nren': nren, + 'nren_id': nren.id, + 'year': year, + 'permanent_fte': permanent_fte, + 'subcontracted_fte': subcontracted_fte, + 'technical_fte': 0, + 'non_technical_fte': 0 + } + + function_data = excel_parser.fetch_staff_function_excel_data() + for (abbrev, year, technical_fte, non_technical_fte) in function_data: + if abbrev not in nren_dict: + continue + + nren = nren_dict[abbrev] + if (nren.id, year) in nren_staff_map: + nren_staff_map[(nren.id, year)]['technical_fte'] = technical_fte + nren_staff_map[(nren.id, year)]['non_technical_fte'] = non_technical_fte + else: + nren_staff_map[(nren.id, year)] = { + 'nren': nren, + 'nren_id': nren.id, + 'year': year, + 'permanent_fte': 0, + 'subcontracted_fte': 0, + 'technical_fte': technical_fte, + 'non_technical_fte': non_technical_fte + } + + for nren_staff_model in nren_staff_map.values(): + + nren = nren_staff_model['nren'] + year = 
nren_staff_model['year'] + staff_type_data = { + 'permanent_fte': nren_staff_model['permanent_fte'], + 'subcontracted_fte': nren_staff_model['subcontracted_fte'], + } + + staff_roles_data = { + 'technical_fte': nren_staff_model['technical_fte'], + 'non_technical_fte': nren_staff_model['non_technical_fte'], + } + + yield ('staff_employment_type', nren, nren.id, year, staff_type_data) + + yield ('staff_roles', nren, nren.id, year, staff_roles_data) + + +def ecprojects(nren_dict): + ecproject_data = excel_parser.fetch_ecproject_excel_data() + + by_nren_year = defaultdict(lambda: defaultdict(set)) + for (abbrev, year, project) in ecproject_data: + if abbrev not in nren_dict: + continue + + nren = nren_dict[abbrev] + + by_nren_year[abbrev][year].add(project) + + for abbrev, year_projects in by_nren_year.items(): + nren = nren_dict[abbrev] + for year, projects in year_projects.items(): + yield ('ec_project_names', nren, nren.id, year, list(projects)) + + +def parent_organizations(nren_dict): + organization_data = excel_parser.fetch_organization_excel_data() + for (abbrev, year, org) in organization_data: + if abbrev not in nren_dict: + continue + + nren = nren_dict[abbrev] + yield ('parent_organization', nren, nren.id, year, 'Yes') + yield ('parent_organization_name', nren, nren.id, year, org) + + +def traffic_volume(nren_dict): + traffic_data = excel_parser.fetch_traffic_excel_data() + for (abbrev, year, from_external, to_external, from_customers, to_customers) in traffic_data: + if abbrev not in nren_dict: + continue + + nren = nren_dict[abbrev] + if nren.name == 'CESNET': + # COMP-447: correct CESNET traffic data for 2019 + if year == 2019: + to_customers = 222766 + + yield ('traffic_estimate', nren, nren.id, year, { + 'from_customers': from_customers, + 'from_external': from_external, + 'to_customers': to_customers, + 'to_external': to_external + }) + + +def connected_proportion(nren_dict): + remit = excel_parser.fetch_remit_excel_data() + nr_connected = excel_parser.fetch_nr_connected_excel_data() + market_share = excel_parser.fetch_market_share_excel_data() + users_served = excel_parser.fetch_users_served_excel_data() + + data_by_nren = defaultdict(lambda: defaultdict(dict)) + + for key in itertools.chain(remit.keys(), nr_connected.keys(), market_share.keys(), users_served.keys()): + (abbrev, year, user_category) = key + if abbrev not in nren_dict: + continue + + covered = remit.get(key) + if covered: + covered = covered.name + market_share_percentage = market_share.get(key) + _nr_connected = nr_connected.get(key) + nr_of_users = users_served.get(key) + + result = {} + if covered: + result['covered'] = covered + if market_share_percentage: + result['market_share_percentage'] = market_share_percentage + if _nr_connected: + result['nr_connected'] = _nr_connected + if nr_of_users: + result['nr_of_users'] = nr_of_users + + existing = data_by_nren[abbrev][year].get(user_category) + if existing: + existing.update(result) + else: + data_by_nren[abbrev][year][user_category] = result + + for abbrev, data in data_by_nren.items(): + for year, category_data in data.items(): + connectivity_proportions = {} + for user_category, user_data in category_data.items(): + connectivity_proportions[user_category.name] = user_data + + yield ('connectivity_proportions', nren_dict[abbrev], nren_dict[abbrev].id, year, connectivity_proportions) + + +def connectivity_level(nren_dict): + typical_speeds = excel_parser.fetch_typical_speed_excel_data() + highest_speeds = excel_parser.fetch_highest_speed_excel_data() + 
highest_speed_proportions = excel_parser.fetch_highest_speed_proportion_excel_data() + data_by_nren = defaultdict(lambda: defaultdict(dict)) + + for key in itertools.chain(typical_speeds.keys(), highest_speeds.keys(), highest_speed_proportions.keys()): + (abbrev, year, user_category) = key + if abbrev not in nren_dict: + continue + + typical_speed = typical_speeds.get(key) + highest_speed = highest_speeds.get(key) + highest_speed_connection_percentage = highest_speed_proportions.get(key) + + result = {} + if typical_speed: + result['typical_speed'] = typical_speed + if highest_speed: + result['highest_speed'] = highest_speed + if highest_speed_connection_percentage: + result['highest_speed_connection_percentage'] = highest_speed_connection_percentage + + existing = data_by_nren[abbrev][year].get(user_category) + if existing: + existing.update(result) + else: + data_by_nren[abbrev][year][user_category] = result + + for abbrev, data in data_by_nren.items(): + for year, category_data in data.items(): + connectivity_level = {} + for user_category, user_data in category_data.items(): + connectivity_level[user_category.name] = user_data + yield ('connectivity_level', nren_dict[abbrev], nren_dict[abbrev].id, year, connectivity_level) + + +def connection_carrier(nren_dict): + carriers = excel_parser.fetch_carriers_excel_data() + data_by_nren = defaultdict(lambda: defaultdict(dict)) + + for key, carry_mechanism in carriers.items(): + (abbrev, year, user_category) = key + if abbrev not in nren_dict: + continue + + result = {} + if carry_mechanism: + result['carry_mechanism'] = carry_mechanism.name + + existing = data_by_nren[abbrev][year].get(user_category) + if existing: + existing.update(result) + else: + data_by_nren[abbrev][year][user_category] = result + + for abbrev, data in data_by_nren.items(): + for year, category_data in data.items(): + traffic_carriers = {} + for user_category, user_data in category_data.items(): + traffic_carriers[user_category.name] = user_data + yield ('traffic_carriers', nren_dict[abbrev], nren_dict[abbrev].id, year, traffic_carriers) + + +def connectivity_growth(nren_dict): + growth = excel_parser.fetch_growth_excel_data() + data_by_nren = defaultdict(lambda: defaultdict(dict)) + for key, growth_percent in growth.items(): + (abbrev, year, user_category) = key + if abbrev not in nren_dict: + continue + + result = {} + if growth_percent: + result['growth_rate'] = growth_percent + + existing = data_by_nren[abbrev][year].get(user_category) + if existing: + existing.update(result) + else: + data_by_nren[abbrev][year][user_category] = result + + for abbrev, data in data_by_nren.items(): + for year, category_data in data.items(): + traffic_growth = {} + for user_category, user_data in category_data.items(): + traffic_growth[user_category.name] = user_data + yield ('traffic_growth', nren_dict[abbrev], nren_dict[abbrev].id, year, traffic_growth) + + +def connectivity_load(nren_dict): + averages = excel_parser.fetch_average_traffic_excel_data() + peaks = excel_parser.fetch_peak_traffic_excel_data() + + all_entry_keys = set() + all_entry_keys.update(averages.keys()) + all_entry_keys.update(peaks.keys()) + data_by_nren = defaultdict(lambda: defaultdict(dict)) + for key in all_entry_keys: + (abbrev, year, user_category) = key + if abbrev not in nren_dict: + continue + + result = {} + average = averages.get(key, (None, None)) + peak = peaks.get(key, (None, None)) + average_from_institutions_to_network = average[0] + average_to_institutions_from_network = average[1] + 
peak_from_institutions_to_network = peak[0]
+ peak_to_institutions_from_network = peak[1]
+
+ if average_from_institutions_to_network:
+ result['average_from_institutions_to_network'] = average_from_institutions_to_network
+ if average_to_institutions_from_network:
+ result['average_to_institutions_from_network'] = average_to_institutions_from_network
+ if peak_from_institutions_to_network:
+ result['peak_from_institutions_to_network'] = peak_from_institutions_to_network
+ if peak_to_institutions_from_network:
+ result['peak_to_institutions_from_network'] = peak_to_institutions_from_network
+
+ existing = data_by_nren[abbrev][year].get(user_category)
+ if existing:
+ existing.update(result)
+ else:
+ data_by_nren[abbrev][year][user_category] = result
+
+ for abbrev, data in data_by_nren.items():
+ for year, category_data in data.items():
+ traffic_load = {}
+ for user_category, user_data in category_data.items():
+ traffic_load[user_category.name] = user_data
+ yield ('traffic_load', nren_dict[abbrev], nren_dict[abbrev].id, year, traffic_load)
+
+
+def remote_campuses(nren_dict):
+ campuses = excel_parser.fetch_remote_campuses_excel_data()
+
+ for (abbrev, year, connectivity, country, connected_to_r_e) in campuses:
+ if abbrev not in nren_dict:
+ continue
+
+ if connectivity or country:
+ yield ('remote_campuses', nren_dict[abbrev], nren_dict[abbrev].id, year, "Yes")
+ if country:
+ yield ('remote_campuses_specifics', nren_dict[abbrev], nren_dict[abbrev].id, year, [
+ {'country': country, 'connected': connected_to_r_e}
+ ])
+
+
+def dark_fibre_lease(nren_dict):
+ data_rows = excel_parser.fetch_dark_fibre_iru_excel_data()
+ iru_durations = excel_parser.fetch_iru_duration_excel_data()
+
+ for (abbrev, year, iru, length_in_country, length_out_country) in data_rows:
+ if abbrev not in nren_dict:
+ continue
+
+ nren = nren_dict[abbrev]
+
+ dark_fibre_lease = iru
+ dark_fibre_lease_duration = iru_durations.get((abbrev, year))
+
+ if dark_fibre_lease or dark_fibre_lease_duration or length_in_country or length_out_country:
+ yield ('dark_fibre_lease', nren, nren.id, year, "Yes")
+ else:
+ # no dark fibre lease data for this NREN/year; skip the row instead of ending the generator
+ continue
+
+ if dark_fibre_lease_duration:
+ yield ('dark_fibre_lease_duration', nren, nren.id, year, dark_fibre_lease_duration)
+ if length_in_country:
+ yield ('dark_fibre_lease_kilometers_inside_country', nren, nren.id, year, length_in_country)
+ if length_out_country:
+ yield ('dark_fibre_lease_kilometers_outside_country', nren, nren.id, year, length_out_country)
+
+
+def dark_fibre_installed(nren_dict):
+ data_rows = excel_parser.fetch_dark_fibre_installed_excel_data()
+ for (abbrev, year, installed, length) in data_rows:
+ if abbrev not in nren_dict:
+ continue
+
+ nren = nren_dict[abbrev]
+
+ if installed or length:
+ yield ('dark_fibre_nren', nren, nren.id, year, "Yes")
+ if length:
+ yield ('dark_fibre_nren_kilometers_inside_country', nren, nren.id, year, length)
+
+
+def passive_monitoring(nren_dict):
+ data_rows = excel_parser.fetch_passive_monitoring_excel_data()
+ for (abbrev, year, monitoring, method) in data_rows:
+ if abbrev not in nren_dict:
+ continue
+
+ if monitoring or method:
+ yield ('passive_monitoring', nren_dict[abbrev], nren_dict[abbrev].id, year, "Yes")
+ if method:
+ yield ('passive_monitoring_tech', nren_dict[abbrev], nren_dict[abbrev].id, year, method.name)
+
+
+def capacity(nren_dict):
+ largest_data_rows = excel_parser.fetch_largest_link_capacity_excel_data()
+ typical_data_rows = excel_parser.fetch_typical_backbone_capacity_excel_data()
+
+ by_nren = defaultdict(dict)
+
+ for key
in itertools.chain(largest_data_rows.keys(), typical_data_rows.keys()): + (abbrev, year) = key + if abbrev not in nren_dict: + continue + + to_add = (abbrev, year) + by_nren[to_add].update({ + 'max_capacity': largest_data_rows.get(key, by_nren[to_add].get('max_capacity')), + 'typical_capacity': typical_data_rows.get(key, by_nren[to_add].get('typical_capacity')) + }) + + for (abbrev, year), data in by_nren.items(): + max_capacity = data.get('max_capacity') + typical_capacity = data.get('typical_capacity') + + if max_capacity: + yield ('max_capacity', nren_dict[abbrev], nren_dict[abbrev].id, year, max_capacity) + if typical_capacity: + yield ('typical_capacity', nren_dict[abbrev], nren_dict[abbrev].id, year, typical_capacity) + + +def non_r_e_peers(nren_dict): + data_rows = excel_parser.fetch_non_r_e_peers_excel_data() + for (abbrev, year, nr_of_non_r_and_e_peers) in data_rows: + if abbrev not in nren_dict: + continue + + nren = nren_dict[abbrev] + + if nr_of_non_r_and_e_peers: + yield ('non_r_and_e_peers', nren, nren.id, year, nr_of_non_r_and_e_peers) + + +def ops_automation(nren_dict): + data_rows = excel_parser.fetch_ops_automation_excel_data() + for (abbrev, year, automation, specifics) in data_rows: + if abbrev not in nren_dict: + continue + + nren = nren_dict[abbrev] + + if automation or specifics: + yield ('operational_process_automation', nren, nren.id, year, "Yes") + + if specifics: + yield ('operational_process_automation_tools', nren, nren.id, year, specifics) + + +def fetch_data(): + # requires being in a flask app context when called + nren_dict = helpers.get_uppercase_nren_dict() + yield from budget(nren_dict) + yield from funding(nren_dict) + yield from charging_structure(nren_dict) + yield from staffing(nren_dict) + yield from ecprojects(nren_dict) + yield from parent_organizations(nren_dict) + yield from traffic_volume(nren_dict) + + yield from connected_proportion(nren_dict) + yield from connectivity_level(nren_dict) + yield from connection_carrier(nren_dict) + yield from connectivity_growth(nren_dict) + yield from connectivity_load(nren_dict) + yield from remote_campuses(nren_dict) + + yield from dark_fibre_lease(nren_dict) + yield from dark_fibre_installed(nren_dict) + yield from passive_monitoring(nren_dict) + yield from capacity(nren_dict) + yield from non_r_e_peers(nren_dict) + yield from ops_automation(nren_dict) diff --git a/setup.py b/setup.py index ac1a5915f1aa7d02de9c5ddae44a2295b5f85c9c..2a2dd12810d069f2799930414ec50ed828eaa7a9 100644 --- a/setup.py +++ b/setup.py @@ -30,10 +30,11 @@ setup( include_package_data=True, entry_points={ 'console_scripts': [ - 'excel-survey-publisher=compendium_v2.publishers.survey_publisher_legacy_excel:cli', # noqa - 'db-publisher-2022=compendium_v2.publishers.survey_publisher_old_db_2022:cli', # noqa - 'conversion=compendium_v2.conversion.conversion:cli', # noqa - 'dump_survey_model=compendium_v2.migrations.dump_survey_model:cli', # noqa + 'excel-survey-publisher=compendium_v2.publishers.survey_publisher_legacy_excel:cli', + 'db-publisher-2022=compendium_v2.publishers.survey_publisher_old_db_2022:cli', + 'conversion=compendium_v2.conversion.conversion:cli', + 'dump_survey_model=compendium_v2.migrations.dump_survey_model:cli', + 'legacy-survey-publisher=compendium_v2.publishers.legacy_publisher.survey_publisher_legacy:cli', ] }, license='MIT',
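Note: the new legacy-survey-publisher console script points at the same click cli entry
point, so it can be smoke-tested without installing the package by driving it through
click's CliRunner. The sketch below is illustrative only: the test module name and the
tests/test-config.json path are assumptions, and the survey and presentation databases
referenced by that config must be reachable for the command to finish successfully.

    # tests/test_survey_publisher_legacy.py (hypothetical, not part of this patch)
    from click.testing import CliRunner

    from compendium_v2.publishers.legacy_publisher.survey_publisher_legacy import cli


    def test_legacy_survey_publisher_runs():
        runner = CliRunner()
        # --config defaults to config.json; point it at a test configuration instead
        result = runner.invoke(cli, ['--config', 'tests/test-config.json'])
        assert result.exit_code == 0

Once the package is reinstalled (for example with pip install -e .), the equivalent shell
invocation is: legacy-survey-publisher --config config.json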