Skip to content
Snippets Groups Projects
Commit 4f334d0d authored by Bjarke Madsen's avatar Bjarke Madsen
Browse files

Add back in data publishers until we clean the survey data

parent ea2254dd
No related branches found
No related tags found
No related merge requests found
"""
survey_publisher_v1
=========================
This module loads the survey data from before 2022 from legacy Excel files.
Missing info is filled in from the survey db for some questions.
Registered as click cli command when installing compendium-v2.
"""
from __future__ import annotations
import itertools
import logging
import math
import click
from sqlalchemy import select, delete
from collections import defaultdict
import compendium_v2
from compendium_v2.config import load
from compendium_v2.db import db, presentation_models
from compendium_v2.environment import setup_logging
from compendium_v2.publishers import helpers, excel_parser
from compendium_v2.survey_db import model as survey_model
setup_logging()
logger = logging.getLogger('survey-publisher-legacy-excel')
def db_budget_migration(nren_dict):
    """Publish pre-2022 budget data.

    Entries come from two places, both keyed by (nren id, year): the legacy
    survey DB (``survey_model.Nrens`` budgets) and the legacy excel export.
    Excel values win because they are written into ``inserts`` second.

    :param nren_dict: mapping of upper-case NREN abbreviation -> NREN model
    """
    # replace all presentation rows for years before 2022
    data = db.session.scalars(select(survey_model.Nrens))
    db.session.execute(delete(presentation_models.BudgetEntry).where(
        presentation_models.BudgetEntry.year < 2022))
    inserts = defaultdict(dict)

    # move data from Survey DB budget table
    for nren in data:
        for budget in nren.budgets:
            abbrev = nren.abbreviation.upper()
            year = budget.year
            # budgets are in millions of EUR; >400M or exactly 0 is treated
            # as bad data and skipped
            if float(budget.budget) > 400:
                logger.warning(f'Incorrect Data: {abbrev} has budget set >400M EUR for {year}. ({budget.budget})')
                continue
            if float(budget.budget) == 0:
                logger.warning(f'Incorrect Data: {abbrev} has budget set to 0 EUR for {year}.')
                continue
            if abbrev not in nren_dict:
                logger.warning(f'{abbrev} unknown. Skipping.')
                continue
            budget_entry = {
                'nren': nren_dict[abbrev],
                'nren_id': nren_dict[abbrev].id,
                'budget': float(budget.budget),
                'year': year
            }
            inserts[nren_dict[abbrev].id][year] = budget_entry

    # Import the data from excel sheet to database (overrides survey DB rows)
    exceldata = excel_parser.fetch_budget_excel_data()
    for abbrev, budget, year in exceldata:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        if budget > 400:
            # bug fix: this message previously interpolated the stale `nren`
            # loop variable left over from the survey-DB loop above, logging
            # the wrong NREN; use the current excel abbreviation instead
            logger.warning(f'{abbrev} has budget set to >400M EUR for {year}. ({budget})')
            continue
        budget_entry = {
            'nren': nren_dict[abbrev],
            'nren_id': nren_dict[abbrev].id,
            'budget': budget,
            'year': year
        }
        inserts[nren_dict[abbrev].id][year] = budget_entry

    # flatten {nren_id: {year: entry}} into a flat list of mappings
    all_inserts = [entry for by_year in inserts.values() for entry in by_year.values()]
    db.session.bulk_insert_mappings(presentation_models.BudgetEntry, all_inserts)
    db.session.commit()
def db_funding_migration(nren_dict):
    """Publish pre-2022 funding-source percentages from the legacy excel export.

    Rows whose five shares do not sum to 100% (within 0.01) are skipped.
    """
    # Import the data to database
    data = excel_parser.fetch_funding_excel_data()
    db.session.execute(delete(presentation_models.FundingSource).where(
        presentation_models.FundingSource.year < 2022))
    inserts = []
    for (abbrev, year, client_institution, european_funding,
         gov_public_bodies, commercial, other) in data:
        shares = (client_institution, european_funding, gov_public_bodies, commercial, other)
        total = sum(shares)
        if not math.isclose(total, 100, abs_tol=0.01):
            logger.warning(f'{abbrev} funding sources for {year} do not sum to 100% ({total})')
            continue
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        nren = nren_dict[abbrev]
        inserts.append({
            'nren': nren,
            'nren_id': nren.id,
            'year': year,
            'client_institutions': client_institution,
            'european_funding': european_funding,
            'gov_public_bodies': gov_public_bodies,
            'commercial': commercial,
            'other': other
        })
    db.session.bulk_insert_mappings(presentation_models.FundingSource, inserts)
    db.session.commit()
def db_charging_structure_migration(nren_dict):
    """Publish pre-2022 charging structure data from the legacy excel export."""
    # Import the data to database
    rows = excel_parser.fetch_charging_structure_excel_data()
    db.session.execute(delete(presentation_models.ChargingStructure).where(
        presentation_models.ChargingStructure.year < 2022))
    inserts = []
    for abbrev, year, charging_structure in rows:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        nren = nren_dict[abbrev]
        inserts.append({
            'nren': nren,
            'nren_id': nren.id,
            'year': year,
            'fee_type': charging_structure
        })
    db.session.bulk_insert_mappings(presentation_models.ChargingStructure, inserts)
    db.session.commit()
def db_staffing_migration(nren_dict):
    """Publish pre-2022 staffing data from the legacy excel export.

    Merges two sheets keyed by (nren id, year): staff totals
    (permanent/subcontracted FTEs) and staff-by-function
    (technical/non-technical FTEs). A missing half defaults to 0.

    :param nren_dict: mapping of upper-case NREN abbreviation -> NREN model
    """
    db.session.execute(delete(presentation_models.NrenStaff).where(
        presentation_models.NrenStaff.year < 2022))
    staff_data = list(excel_parser.fetch_staffing_excel_data())
    nren_staff_map = {}
    inserts = []
    # first pass: staff totals, with the function split zeroed for now
    for (abbrev, year, permanent_fte, subcontracted_fte) in staff_data:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping staff data.')
            continue
        nren = nren_dict[abbrev]
        nren_staff_map[(nren.id, year)] = {
            'nren': nren,
            'nren_id': nren.id,
            'year': year,
            'permanent_fte': permanent_fte,
            'subcontracted_fte': subcontracted_fte,
            'technical_fte': 0,
            'non_technical_fte': 0
        }
    # second pass: fill in (or create, with zeroed totals) the function split
    function_data = excel_parser.fetch_staff_function_excel_data()
    for (abbrev, year, technical_fte, non_technical_fte) in function_data:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping staff function data.')
            continue
        nren = nren_dict[abbrev]
        if (nren.id, year) in nren_staff_map:
            nren_staff_map[(nren.id, year)]['technical_fte'] = technical_fte
            nren_staff_map[(nren.id, year)]['non_technical_fte'] = non_technical_fte
        else:
            nren_staff_map[(nren.id, year)] = {
                'nren': nren,
                'nren_id': nren.id,
                'year': year,
                'permanent_fte': 0,
                'subcontracted_fte': 0,
                'technical_fte': technical_fte,
                'non_technical_fte': non_technical_fte
            }
    for nren_staff_model in nren_staff_map.values():
        employed = nren_staff_model['permanent_fte'] + nren_staff_model['subcontracted_fte']
        technical = nren_staff_model['technical_fte'] + nren_staff_model['non_technical_fte']
        # consistency check is log-only: mismatching rows are still inserted
        if not math.isclose(employed, technical, abs_tol=0.01) and employed != 0 and technical != 0:
            logger.warning(f'{nren_staff_model["nren"].name} in {nren_staff_model["year"]}:'
                           f' FTE do not equal across employed/technical categories ({employed} != {technical})')
        # bulk_insert_mappings takes plain column values; drop the model object
        del nren_staff_model['nren']
        inserts.append(nren_staff_model)
    db.session.bulk_insert_mappings(presentation_models.NrenStaff, inserts)
    db.session.commit()
def db_ecprojects_migration(nren_dict):
    """Publish pre-2022 EC project participation from the legacy excel export.

    Duplicate (nren, year, project) rows in the sheet are logged and skipped.
    """
    db.session.execute(delete(presentation_models.ECProject).where(
        presentation_models.ECProject.year < 2022))
    ecproject_data = excel_parser.fetch_ecproject_excel_data()
    seen = set()
    inserts = []
    for abbrev, year, project in ecproject_data:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        nren = nren_dict[abbrev]
        entry_key = (nren.id, year, project)
        if entry_key in seen:
            logger.warning(f'{nren.name} has duplicate EC project {project} in {year}. Skipping.')
            continue
        seen.add(entry_key)
        inserts.append({
            'nren_id': nren.id,
            'year': year,
            'project': project
        })
    db.session.bulk_insert_mappings(presentation_models.ECProject, inserts)
    db.session.commit()
def db_organizations_migration(nren_dict):
    """Publish pre-2022 parent organization data from the legacy excel export."""
    db.session.execute(delete(presentation_models.ParentOrganization).where(
        presentation_models.ParentOrganization.year < 2022))
    rows = excel_parser.fetch_organization_excel_data()
    inserts = []
    for abbrev, year, org in rows:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        nren = nren_dict[abbrev]
        inserts.append({'nren_id': nren.id, 'year': year, 'organization': org})
    db.session.bulk_insert_mappings(presentation_models.ParentOrganization, inserts)
    db.session.commit()
def db_traffic_volume_migration(nren_dict):
    """Publish pre-2023 traffic volume data from the legacy excel export."""
    db.session.execute(delete(presentation_models.TrafficVolume).where(
        presentation_models.TrafficVolume.year < 2023))
    rows = excel_parser.fetch_traffic_excel_data()
    inserts = []
    for abbrev, year, from_external, to_external, from_customers, to_customers in rows:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        nren = nren_dict[abbrev]
        # COMP-447: correct CESNET traffic data for 2019
        if nren.name == 'CESNET' and year == 2019:
            to_customers = 222766
        inserts.append({
            'nren_id': nren.id,
            'year': year,
            'from_customers': from_customers,
            'to_customers': to_customers,
            'from_external': from_external,
            'to_external': to_external
        })
    db.session.bulk_insert_mappings(presentation_models.TrafficVolume, inserts)
    db.session.commit()
def db_nren_services_migration(nren_dict):
    """Upsert (merge) NREN service entries from the legacy excel export."""
    all_services = list(db.session.scalars(select(presentation_models.Service)))
    for info in excel_parser.fetch_nren_services_excel_data():
        # exactly one service must match the sheet's name key; anything else
        # raises (same destructuring contract as before)
        [service] = [s for s in all_services if s.name_key == info['service_name_key']]
        abbrev = info['nren_name']
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        nren = nren_dict[abbrev]
        db.session.merge(presentation_models.NRENService(
            nren=nren,
            nren_id=nren.id,
            year=info['year'],
            service=service,
            service_key=service.name_key,
            product_name=info['product_name'],
            additional_information=info['additional_information'],
            official_description=info['official_description']
        ))
    db.session.commit()
def db_connected_proportion_migration(nren_dict):
    """Publish pre-2023 connected-proportion data from the legacy excel export.

    Four sheets are merged per (nren id, year, user category): remit
    (coverage), number connected, market share and users served.

    :param nren_dict: mapping of upper-case NREN abbreviation -> NREN model
    """
    db.session.execute(delete(presentation_models.ConnectedProportion).where(
        presentation_models.ConnectedProportion.year < 2023))
    remit = excel_parser.fetch_remit_excel_data()
    nr_connected = excel_parser.fetch_nr_connected_excel_data()
    market_share = excel_parser.fetch_market_share_excel_data()
    users_served = excel_parser.fetch_users_served_excel_data()
    data_by_nren = defaultdict(dict)
    for key in itertools.chain(remit.keys(), nr_connected.keys(), market_share.keys(), users_served.keys()):
        (abbrev, year, user_category) = key
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        nren = nren_dict[abbrev]
        to_add = (nren.id, year, user_category)
        data_by_nren[to_add].update({
            'nren_id': nren.id,
            'year': year,
            'user_category': user_category,
            # bug fix: coverage now falls back to the already-collected value
            # (as the other three fields do), so a later occurrence of `key`
            # coming from another source dict no longer resets it to None
            'coverage': remit.get(key, data_by_nren[to_add].get('coverage')),
            'number_connected': nr_connected.get(key, data_by_nren[to_add].get('number_connected')),
            'market_share': market_share.get(key, data_by_nren[to_add].get('market_share')),
            'users_served': users_served.get(key, data_by_nren[to_add].get('users_served'))
        })
    inserts = list(data_by_nren.values())
    db.session.bulk_insert_mappings(presentation_models.ConnectedProportion, inserts)
    db.session.commit()
def db_connectivity_level_migration(nren_dict):
    """Publish pre-2023 connectivity level data from the legacy excel export.

    Three sheets are merged per (nren id, year, user category).
    """
    db.session.execute(delete(presentation_models.ConnectivityLevel).where(
        presentation_models.ConnectivityLevel.year < 2023))
    # column name -> source sheet, in the order the sheets are fetched
    sources = {
        'typical_speed': excel_parser.fetch_typical_speed_excel_data(),
        'highest_speed': excel_parser.fetch_highest_speed_excel_data(),
        'highest_speed_proportion': excel_parser.fetch_highest_speed_proportion_excel_data(),
    }
    collected = defaultdict(dict)
    for key in itertools.chain.from_iterable(d.keys() for d in sources.values()):
        abbrev, year, user_category = key
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        nren = nren_dict[abbrev]
        record = collected[(nren.id, year, user_category)]
        record['nren_id'] = nren.id
        record['year'] = year
        record['user_category'] = user_category
        for column, source in sources.items():
            # keep an already-seen value when this sheet lacks the key
            record[column] = source.get(key, record.get(column))
    db.session.bulk_insert_mappings(presentation_models.ConnectivityLevel, list(collected.values()))
    db.session.commit()
def db_connection_carrier_migration(nren_dict):
    """Publish pre-2023 connection carrier data from the legacy excel export."""
    db.session.execute(delete(presentation_models.ConnectionCarrier).where(
        presentation_models.ConnectionCarrier.year < 2023))
    inserts = []
    for (abbrev, year, user_category), carry_mechanism in excel_parser.fetch_carriers_excel_data().items():
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        inserts.append({
            'nren_id': nren_dict[abbrev].id,
            'year': year,
            'user_category': user_category,
            'carry_mechanism': carry_mechanism
        })
    db.session.bulk_insert_mappings(presentation_models.ConnectionCarrier, inserts)
    db.session.commit()
def db_connectivity_growth_migration(nren_dict):
    """Publish pre-2023 connectivity growth data from the legacy excel export."""
    db.session.execute(delete(presentation_models.ConnectivityGrowth).where(
        presentation_models.ConnectivityGrowth.year < 2023))
    inserts = []
    for (abbrev, year, user_category), growth_percent in excel_parser.fetch_growth_excel_data().items():
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        inserts.append({
            'nren_id': nren_dict[abbrev].id,
            'year': year,
            'user_category': user_category,
            'growth': growth_percent
        })
    db.session.bulk_insert_mappings(presentation_models.ConnectivityGrowth, inserts)
    db.session.commit()
def db_connectivity_load_migration(nren_dict):
    """Publish pre-2023 connectivity load data from the legacy excel export.

    Average and peak sheets hold (from, to) tuples per
    (nren, year, user category); a missing side becomes (None, None).
    """
    db.session.execute(delete(presentation_models.ConnectivityLoad).where(
        presentation_models.ConnectivityLoad.year < 2023))
    average = excel_parser.fetch_average_traffic_excel_data()
    peak = excel_parser.fetch_peak_traffic_excel_data()
    inserts = []
    for key in set(average) | set(peak):
        abbrev, year, user_category = key
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        avg_from, avg_to = average.get(key, (None, None))
        peak_from, peak_to = peak.get(key, (None, None))
        inserts.append({
            'nren_id': nren_dict[abbrev].id,
            'year': year,
            'user_category': user_category,
            'average_load_from_institutions': avg_from,
            'average_load_to_institutions': avg_to,
            'peak_load_from_institutions': peak_from,
            'peak_load_to_institutions': peak_to
        })
    db.session.bulk_insert_mappings(presentation_models.ConnectivityLoad, inserts)
    db.session.commit()
def db_remote_campuses_migration(nren_dict):
    """Publish pre-2023 remote campus connectivity from the legacy excel export."""
    db.session.execute(delete(presentation_models.RemoteCampuses).where(
        presentation_models.RemoteCampuses.year < 2023))
    campuses = excel_parser.fetch_remote_campuses_excel_data()
    inserts = []
    for abbrev, year, connectivity, country, connected_to_r_e in campuses:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        # a row without a country yields an empty connections list
        connections = (
            [{'country': country, 'local_r_and_e_connection': connected_to_r_e}]
            if country else []
        )
        inserts.append({
            'nren_id': nren_dict[abbrev].id,
            'year': year,
            'remote_campus_connectivity': connectivity,
            'connections': connections
        })
    db.session.bulk_insert_mappings(presentation_models.RemoteCampuses, inserts)
    db.session.commit()
def db_dark_fibre_lease_migration(nren_dict):
    """Publish pre-2023 dark fibre IRU/lease data from the legacy excel export."""
    db.session.execute(delete(presentation_models.DarkFibreLease).where(
        presentation_models.DarkFibreLease.year < 2023))
    rows = excel_parser.fetch_dark_fibre_iru_excel_data()
    durations = excel_parser.fetch_iru_duration_excel_data()
    inserts = []
    for abbrev, year, iru, length_in_country, length_out_country in rows:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        inserts.append({
            'nren_id': nren_dict[abbrev].id,
            'year': year,
            'iru_or_lease': iru,
            'fibre_length_in_country': length_in_country,
            'fibre_length_outside_country': length_out_country,
            # duration sheet is keyed by (abbrev, year); None when absent
            'iru_duration': durations.get((abbrev, year))
        })
    db.session.bulk_insert_mappings(presentation_models.DarkFibreLease, inserts)
    db.session.commit()
def db_dark_fibre_installed_migration(nren_dict):
    """Publish pre-2023 installed dark fibre data from the legacy excel export."""
    db.session.execute(delete(presentation_models.DarkFibreInstalled).where(
        presentation_models.DarkFibreInstalled.year < 2023))
    rows = excel_parser.fetch_dark_fibre_installed_excel_data()
    inserts = []
    for abbrev, year, installed, length in rows:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        inserts.append({
            'nren_id': nren_dict[abbrev].id,
            'year': year,
            'installed': installed,
            'fibre_length_in_country': length
        })
    db.session.bulk_insert_mappings(presentation_models.DarkFibreInstalled, inserts)
    db.session.commit()
def db_passive_monitoring_migration(nren_dict):
    """Publish pre-2023 passive monitoring data from the legacy excel export."""
    db.session.execute(delete(presentation_models.PassiveMonitoring).where(
        presentation_models.PassiveMonitoring.year < 2023))
    rows = excel_parser.fetch_passive_monitoring_excel_data()
    inserts = []
    for abbrev, year, monitoring, method in rows:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        inserts.append({
            'nren_id': nren_dict[abbrev].id,
            'year': year,
            'monitoring': monitoring,
            'method': method
        })
    db.session.bulk_insert_mappings(presentation_models.PassiveMonitoring, inserts)
    db.session.commit()
def db_capacity_migration(nren_dict):
    """Publish pre-2023 link capacity data from the legacy excel export.

    Two sheets are merged per (nren id, year).
    """
    db.session.execute(delete(presentation_models.Capacity).where(
        presentation_models.Capacity.year < 2023))
    # column name -> source sheet, in the order the sheets are fetched
    sources = {
        'largest_link_capacity': excel_parser.fetch_largest_link_capacity_excel_data(),
        'typical_backbone_capacity': excel_parser.fetch_typical_backbone_capacity_excel_data(),
    }
    collected = defaultdict(dict)
    for key in itertools.chain.from_iterable(d.keys() for d in sources.values()):
        abbrev, year = key
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        nren = nren_dict[abbrev]
        record = collected[(nren.id, year)]
        record['nren_id'] = nren.id
        record['year'] = year
        for column, source in sources.items():
            # keep an already-seen value when this sheet lacks the key
            record[column] = source.get(key, record.get(column))
    db.session.bulk_insert_mappings(presentation_models.Capacity, list(collected.values()))
    db.session.commit()
def db_non_r_e_peers_migration(nren_dict):
    """Publish pre-2023 non-R&E peering counts from the legacy excel export."""
    db.session.execute(delete(presentation_models.NonREPeers).where(
        presentation_models.NonREPeers.year < 2023))
    rows = excel_parser.fetch_non_r_e_peers_excel_data()
    inserts = []
    for abbrev, year, nr_of_non_r_and_e_peers in rows:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        inserts.append({
            'nren_id': nren_dict[abbrev].id,
            'year': year,
            'nr_of_non_r_and_e_peers': nr_of_non_r_and_e_peers
        })
    db.session.bulk_insert_mappings(presentation_models.NonREPeers, inserts)
    db.session.commit()
def db_ops_automation_migration(nren_dict):
    """Publish pre-2023 operations automation data from the legacy excel export."""
    db.session.execute(delete(presentation_models.OpsAutomation).where(
        presentation_models.OpsAutomation.year < 2023))
    rows = excel_parser.fetch_ops_automation_excel_data()
    inserts = []
    for abbrev, year, automation, specifics in rows:
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        inserts.append({
            'nren_id': nren_dict[abbrev].id,
            'year': year,
            'ops_automation': automation,
            'ops_automation_specifics': specifics
        })
    db.session.bulk_insert_mappings(presentation_models.OpsAutomation, inserts)
    db.session.commit()
def _cli(app):
    """Run every legacy-data migration inside the Flask app context.

    Each migration independently clears its own presentation table for the
    legacy year range before inserting, so the call order below is not
    load-bearing.

    :param app: the compendium-v2 Flask application
    """
    with app.app_context():
        # keys are upper-case NREN abbreviations
        nren_dict = helpers.get_uppercase_nren_dict()
        db_budget_migration(nren_dict)
        db_funding_migration(nren_dict)
        db_charging_structure_migration(nren_dict)
        db_staffing_migration(nren_dict)
        db_ecprojects_migration(nren_dict)
        db_organizations_migration(nren_dict)
        db_traffic_volume_migration(nren_dict)
        db_nren_services_migration(nren_dict)
        db_connected_proportion_migration(nren_dict)
        db_connectivity_level_migration(nren_dict)
        db_connection_carrier_migration(nren_dict)
        db_connectivity_growth_migration(nren_dict)
        db_connectivity_load_migration(nren_dict)
        db_remote_campuses_migration(nren_dict)
        db_dark_fibre_lease_migration(nren_dict)
        db_dark_fibre_installed_migration(nren_dict)
        db_passive_monitoring_migration(nren_dict)
        db_capacity_migration(nren_dict)
        db_non_r_e_peers_migration(nren_dict)
        db_ops_automation_migration(nren_dict)
@click.command()
@click.option('--config', type=click.STRING, default='config.json')
def cli(config):
    """Entry point for the legacy excel survey publisher.

    :param config: path to the compendium-v2 JSON config file
    """
    # bug fix: close the config file instead of leaking the handle
    with open(config, 'r') as config_file:
        app_config = load(config_file)
    app_config['SQLALCHEMY_BINDS'] = {survey_model.SURVEY_DB_BIND: app_config['SURVEY_DATABASE_URI']}
    app = compendium_v2._create_app_with_db(app_config)
    print("survey-publisher-v1 starting")
    _cli(app)
if __name__ == "__main__":
    # allow running the publisher directly, without the installed entry point
    cli()
"""
survey_publisher_old_db_2022
============================
This module loads the survey data from 2022 from the old survey database into presentation_models.
Registered as click cli command when installing compendium-v2.
"""
from decimal import Decimal
import logging
import click
import enum
import math
import json
import html
import itertools
from sqlalchemy import delete, text
from collections import defaultdict
import compendium_v2
from compendium_v2.conversion.mapping import CHARGING_LEVELS, CONNECTION, INTERCONNECTION, SERVICE_USER_TYPE_TO_CODE
from compendium_v2.db.presentation_model_enums import CommercialCharges, CommercialConnectivityCoverage, \
ConnectionMethod, FeeType, ServiceCategory, UserCategory, YesNoPlanned
from compendium_v2.environment import setup_logging
from compendium_v2.config import load
from compendium_v2.publishers.helpers import extract_urls, valid_url
from compendium_v2.survey_db import model as survey_model
from compendium_v2.db import db, presentation_models
from compendium_v2.publishers import helpers
from compendium_v2.conversion import mapping
setup_logging()
logger = logging.getLogger('survey-publisher-old-db-2022')
# Latest 2022 budget answer per NREN: DISTINCT ON + ORDER BY updated_at DESC
# keeps only the most recent answer for each (nren, question) pair.
BUDGET_QUERY = """
SELECT DISTINCT ON (n.id, a.question_id)
n.abbreviation AS nren,
a.value AS budget
FROM answers a
JOIN nrens n ON a.nren_id = n.id
JOIN questions q ON a.question_id = q.id
JOIN sections s ON q.section_id = s.id
JOIN compendia c ON s.compendium_id = c.id
WHERE
a.question_id = 16402
AND c.year = 2022
ORDER BY n.id, a.question_id, a.updated_at DESC
"""
# Latest answer per NREN for one question/year; filled in via str.format with
# (question_id, year). Filters out common "no answer" placeholder values.
QUESTION_TEMPLATE_QUERY = """
SELECT DISTINCT ON (n.id, a.question_id)
n.abbreviation AS nren,
a.value AS value
FROM answers a
JOIN nrens n ON a.nren_id = n.id
JOIN questions q ON a.question_id = q.id
JOIN sections s ON q.section_id = s.id
JOIN compendia c ON s.compendium_id = c.id
WHERE
a.question_id = {}
AND c.year = {}
AND a.value NOT IN ('"NA"', '"N/A"', '[""]', '["-"]', '["/"]')
ORDER BY n.id, a.question_id, a.updated_at DESC
"""
# Walks the equivalent_question_id chain from a 2022 question id back through
# earlier years' versions of the same question, then returns the latest
# answer per (nren, question), excluding placeholder "no answer" values.
# Filled in via str.format with the 2022 question id.
RECURSIVE_QUERY = """
WITH RECURSIVE parent_questions AS (
-- Base case
SELECT q.id, q.equivalent_question_id, c.year, q.title
FROM questions q
JOIN sections s ON q.section_id = s.id
JOIN compendia c ON s.compendium_id = c.id
WHERE q.id = {}
UNION ALL
-- Recursive case
SELECT q.id, q.equivalent_question_id, c.year, q.title
FROM questions q
INNER JOIN parent_questions pq ON q.id = pq.equivalent_question_id
JOIN sections s ON q.section_id = s.id
JOIN compendia c ON s.compendium_id = c.id)
SELECT DISTINCT ON (n.id, answers.question_id) answers.id,
UPPER(n.abbreviation) AS nren,
parent_questions.year,
answers.value as answer
FROM answers
JOIN parent_questions ON answers.question_id = parent_questions.id
JOIN nrens n on answers.nren_id = n.id
WHERE UPPER(answers.value) NOT IN ('"NA"', '"N/A"', '[""]', '["-"]', '["/"]', '/', '["NA"]', '""', '[]', '[n/a]')
ORDER BY n.id, answers.question_id, answers.updated_at DESC;
"""
class FundingSource(enum.Enum):
    """Survey DB question ids for the 2022 funding-source answers.

    Answers are percentages that should sum to 100 per NREN.
    """
    CLIENT_INSTITUTIONS = 16405
    EUROPEAN_FUNDING = 16406
    COMMERCIAL = 16407
    OTHER = 16408
    GOV_PUBLIC_BODIES = 16409
class StaffQuestion(enum.Enum):
    """Survey DB question ids for the 2022 staffing answers.

    Answers are numbers expressed in FTEs (full time equivalents)
    """
    PERMANENT_FTE = 16414
    SUBCONTRACTED_FTE = 16413
    TECHNICAL_FTE = 16416
    NON_TECHNICAL_FTE = 16417
class OrgQuestion(enum.Enum):
    """Survey DB question ids for the 2022 organization answers.

    Answers are strings. Each SUB_ORGS_<n>_* triple holds a sub-organization's
    name, its relationship choice, and the free-text role used when the
    choice is 'other'.
    """
    PARENT_ORG_NAME = 16419
    SUB_ORGS_1_NAME = 16422
    SUB_ORGS_1_CHOICE = 16449
    SUB_ORGS_1_ROLE = 16426
    SUB_ORGS_2_NAME = 16429
    SUB_ORGS_2_CHOICE = 16448
    SUB_ORGS_2_ROLE = 16434
    SUB_ORGS_3_NAME = 16430
    SUB_ORGS_3_CHOICE = 16446
    SUB_ORGS_3_ROLE = 16435
    SUB_ORGS_4_NAME = 16432
    SUB_ORGS_4_CHOICE = 16451
    SUB_ORGS_4_ROLE = 16438
    SUB_ORGS_5_NAME = 16433
    SUB_ORGS_5_CHOICE = 16450
    SUB_ORGS_5_ROLE = 16439
class ECQuestion(enum.Enum):
    """Survey DB question id for the 2022 EC-project participation answers."""
    EC_PROJECT = 16453
class ChargingStructure(enum.Enum):
    """Survey DB question id for the 2022 charging-structure answer.

    Answers are strings
    """
    charging_structure = 16410
def query_budget():
    """Return the latest 2022 budget answer per NREN from the survey DB."""
    bind = {'bind': db.engines[survey_model.SURVEY_DB_BIND]}
    return db.session.execute(text(BUDGET_QUERY), bind_arguments=bind)
def recursive_query(question_id_2022):
    """Fetch the latest answers, across all survey years, for a question lineage.

    :param question_id_2022: the 2022 question id whose historical
        equivalents are resolved via RECURSIVE_QUERY; must be truthy
    """
    assert question_id_2022
    sql = text(RECURSIVE_QUERY.format(question_id_2022))
    return db.session.execute(sql, bind_arguments={'bind': db.engines[survey_model.SURVEY_DB_BIND]})
def query_funding_sources():
    """Yield (FundingSource member, result rows) for each funding-source question."""
    for source in FundingSource:
        sql = text(QUESTION_TEMPLATE_QUERY.format(source.value, 2022))
        rows = db.session.execute(sql, bind_arguments={'bind': db.engines[survey_model.SURVEY_DB_BIND]})
        yield source, rows
def query_question(question: enum.Enum):
    """Return the latest 2022 answers for *question*, whose value is a question id."""
    question_id = question.value
    return query_question_id(question_id)
def query_question_id(question_id: int, year: int = 2022):
    """Run QUESTION_TEMPLATE_QUERY for one question id and year against the survey DB."""
    sql = text(QUESTION_TEMPLATE_QUERY.format(question_id, year))
    return db.session.execute(sql, bind_arguments={'bind': db.engines[survey_model.SURVEY_DB_BIND]})
def _parse_json_urls(value, nren_name):
if value and not value.startswith('['):
value = f'[{value}]'
try:
return [url.strip().strip('/') for url in json.loads(value) if url.strip()]
except json.decoder.JSONDecodeError:
logger.info(f'JSON decode error for urls for {nren_name}.')
return []
def transfer_budget(nren_dict):
    """Transfer the 2022 budget answers from the old survey DB.

    Non-numeric, zero, and >400M EUR values are logged and skipped.
    """
    rows = query_budget()
    db.session.execute(delete(presentation_models.BudgetEntry).where(
        presentation_models.BudgetEntry.year == 2022)
    )
    for raw_name, raw_budget in rows:
        nren_name = raw_name.upper()
        try:
            budget = float(raw_budget.replace('"', '').replace(',', ''))
        except ValueError:
            logger.info(f'{nren_name} has no budget for 2022. Skipping. ({raw_budget}))')
            continue
        if budget > 400:
            logger.info(f'{nren_name} has budget set to >400M EUR for 2022. ({budget})')
            continue
        if budget == 0:
            logger.info(f'{nren_name} has budget set to 0 for 2022. Skipping.')
            continue
        if nren_name not in nren_dict:
            logger.info(f'{nren_name} unknown. Skipping.')
            continue
        db.session.merge(presentation_models.BudgetEntry(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            budget=budget,
            year=2022,
        ))
    db.session.commit()
def transfer_institutions_urls(nren_dict):
    """Transfer institution URL answers from the old survey DB.

    For this data, nothing comes from the excel publisher, so all rows up to
    and including 2022 are deleted first. URLs are extracted both by regex
    and by JSON parsing; mismatches are logged but the regex result is used.
    """
    db.session.execute(delete(presentation_models.InstitutionURLs).where(
        presentation_models.InstitutionURLs.year <= 2022)
    )
    for answer_id, nren_name, year, answer in recursive_query(16507):
        if nren_name not in nren_dict:
            logger.info(f'{nren_name} unknown. Skipping.')
            continue
        urls = extract_urls(text=answer)
        urls_json = _parse_json_urls(answer, nren_name)
        if urls != urls_json:
            logger.info(f'Institution URLs for {nren_name} do not match between json and regex. {urls} != {urls_json}')
        if not urls:
            logger.info(f'{nren_name} has no urls for {year}. Skipping.')
            continue
        valid_urls = []
        for url in urls:
            if valid_url(url):
                valid_urls.append(url)
            else:
                logger.warning(f'Invalid institution URL for {nren_name} in {year}: {url}. Skipping.')
        if not valid_urls:
            logger.info(f'{nren_name} has no valid urls for {year}. Skipping.')
            continue
        db.session.merge(presentation_models.InstitutionURLs(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            urls=valid_urls,
            year=year,
        ))
    db.session.commit()
def transfer_funding_sources(nren_dict):
    """Transfer the 2022 funding-source answers from the old survey DB.

    Collects the five percentage answers per NREN, skips NRENs whose shares
    do not sum to 100%, and merges one FundingSource row per NREN.

    :param nren_dict: mapping of upper-case NREN abbreviation -> NREN model
    """
    sourcedata = {}
    db.session.execute(delete(presentation_models.FundingSource).where(
        presentation_models.FundingSource.year == 2022)
    )
    for source, data in query_funding_sources():
        for row in data:
            nren_name = row[0].upper()
            _value = row[1]
            try:
                value = float(_value.replace('"', '').replace(',', ''))
            except ValueError:
                name = source.name
                logger.info(f'{nren_name} has invalid value for {name}. ({_value}))')
                # unparseable answers count as a 0% share
                value = 0
            # initialize all five shares to 0 on first sight of this NREN
            nren_info = sourcedata.setdefault(
                nren_name,
                {source_type: 0 for source_type in FundingSource}
            )
            nren_info[source] = value
    for nren_name, nren_info in sourcedata.items():
        total = sum(nren_info.values())
        if not math.isclose(total, 100, abs_tol=0.01):
            logger.info(f'{nren_name} funding sources do not sum to 100%. ({total})')
            continue
        if nren_name not in nren_dict:
            logger.info(f'{nren_name} unknown. Skipping.')
            continue
        if nren_name == 'HEANET':
            # data correction: HEANET's commercial share is folded into 'other'
            nren_info[FundingSource.OTHER] = nren_info[FundingSource.OTHER] + nren_info[FundingSource.COMMERCIAL]
            nren_info[FundingSource.COMMERCIAL] = 0
        funding_source = presentation_models.FundingSource(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=2022,
            client_institutions=nren_info[FundingSource.CLIENT_INSTITUTIONS],
            european_funding=nren_info[FundingSource.EUROPEAN_FUNDING],
            gov_public_bodies=nren_info[FundingSource.GOV_PUBLIC_BODIES],
            commercial=nren_info[FundingSource.COMMERCIAL],
            other=nren_info[FundingSource.OTHER],
        )
        db.session.merge(funding_source)
    db.session.commit()
def transfer_staff_data(nren_dict):
    """Transfer the 2022 staffing answers from the old survey DB.

    Collects the four StaffQuestion answers per NREN, skips (and deletes any
    existing 2022 row for) NRENs with all-zero data, skips NRENs whose
    employed and technical totals disagree, and merges the rest.

    :param nren_dict: mapping of upper-case NREN abbreviation -> NREN model
    """
    data = {}
    db.session.execute(delete(presentation_models.NrenStaff).where(
        presentation_models.NrenStaff.year == 2022)
    )
    for question in StaffQuestion:
        rows = query_question(question)
        for row in rows:
            nren_name = row[0].upper()
            _value = row[1]
            try:
                value = float(_value.replace('"', '').replace(',', ''))
            except ValueError:
                # unparseable answers count as 0 FTE
                value = 0
            if nren_name not in nren_dict:
                logger.info(f'{nren_name} unknown. Skipping.')
                continue
            # initialize on first use, so we don't add data for nrens with no answers
            data.setdefault(nren_name, {question: 0 for question in StaffQuestion})[question] = value
    for nren_name, nren_info in data.items():
        if sum([nren_info[question] for question in StaffQuestion]) == 0:
            logger.info(f'{nren_name} has no staff data. Deleting if exists.')
            db.session.execute(delete(presentation_models.NrenStaff).where(
                presentation_models.NrenStaff.nren_id == nren_dict[nren_name].id,
                presentation_models.NrenStaff.year == 2022
            ))
            continue
        employed = nren_info[StaffQuestion.PERMANENT_FTE] + nren_info[StaffQuestion.SUBCONTRACTED_FTE]
        technical = nren_info[StaffQuestion.TECHNICAL_FTE] + nren_info[StaffQuestion.NON_TECHNICAL_FTE]
        # unlike the excel migration, inconsistent rows are skipped here
        if not math.isclose(employed, technical, abs_tol=0.01):
            logger.info(f'{nren_name} FTE do not equal across employed/technical categories.'
                        f' ({employed} != {technical})')
            continue
        staff_data = presentation_models.NrenStaff(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=2022,
            permanent_fte=nren_info[StaffQuestion.PERMANENT_FTE],
            subcontracted_fte=nren_info[StaffQuestion.SUBCONTRACTED_FTE],
            technical_fte=nren_info[StaffQuestion.TECHNICAL_FTE],
            non_technical_fte=nren_info[StaffQuestion.NON_TECHNICAL_FTE],
        )
        db.session.merge(staff_data)
    db.session.commit()
def transfer_nren_parent_org(nren_dict):
    """Transfer the 2022 parent-organization answers from the old survey DB."""
    # clean up the data a bit by removing some strings
    strings_to_replace = [
        'We are affiliated to '
    ]
    db.session.execute(delete(presentation_models.ParentOrganization).where(
        presentation_models.ParentOrganization.year == 2022)
    )
    for row in query_question(OrgQuestion.PARENT_ORG_NAME):
        nren_name = row[0].upper()
        org = str(row[1]).replace('"', '')
        if not org:
            continue
        for fragment in strings_to_replace:
            org = org.replace(fragment, '')
        if nren_name not in nren_dict:
            logger.info(f'{nren_name} unknown. Skipping.')
            continue
        nren = nren_dict[nren_name]
        db.session.merge(presentation_models.ParentOrganization(
            nren=nren,
            nren_id=nren.id,
            year=2022,
            organization=org,
        ))
    db.session.commit()
def transfer_nren_sub_org(nren_dict):
    """Import 2022 sub-organization answers (name + role) into the presentation model.

    Each NREN could report up to five sub-organizations; each slot is a
    (name, role-choice, free-text-role) question triple. When the role choice
    is "Other" (case-insensitive) the free-text role answer is used instead.
    """
    suborg_questions = [
        (OrgQuestion.SUB_ORGS_1_NAME, OrgQuestion.SUB_ORGS_1_CHOICE, OrgQuestion.SUB_ORGS_1_ROLE),
        (OrgQuestion.SUB_ORGS_2_NAME, OrgQuestion.SUB_ORGS_2_CHOICE, OrgQuestion.SUB_ORGS_2_ROLE),
        (OrgQuestion.SUB_ORGS_3_NAME, OrgQuestion.SUB_ORGS_3_CHOICE, OrgQuestion.SUB_ORGS_3_ROLE),
        (OrgQuestion.SUB_ORGS_4_NAME, OrgQuestion.SUB_ORGS_4_CHOICE, OrgQuestion.SUB_ORGS_4_ROLE),
        (OrgQuestion.SUB_ORGS_5_NAME, OrgQuestion.SUB_ORGS_5_CHOICE, OrgQuestion.SUB_ORGS_5_ROLE)
    ]
    lookup = defaultdict(list)
    # wipe the 2022 rows first so the import below is a clean replace
    db.session.execute(delete(presentation_models.SubOrganization).where(
        presentation_models.SubOrganization.year == 2022)
    )
    # use distinct loop names (name_q/choice_q/role_q) so the question enums
    # can never be confused with the per-NREN `role` value computed below
    for name_q, choice_q, role_q in suborg_questions:
        _name_rows = query_question(name_q)
        _choice_rows = query_question(choice_q)
        _role_rows = list(query_question(role_q))
        for _name, _choice in zip(_name_rows, _choice_rows):
            nren_name = _name[0].upper()
            suborg_name = _name[1].replace('"', '').strip()
            role_choice = _choice[1].replace('"', '').strip()
            if nren_name not in nren_dict:
                logger.info(f'{nren_name} unknown. Skipping.')
                continue
            if role_choice.lower() == 'other':
                # substitute the free-text role answer for this NREN
                for _role in _role_rows:
                    if _role[0] == _name[0]:
                        role = _role[1].replace('"', '').strip()
                        break
                else:
                    # BUG FIX: previously `role` kept a stale value (the role
                    # question enum from the outer tuple unpacking) when no
                    # free-text answer existed, and that stale value was
                    # persisted. Treat this case as "no role given".
                    role = ''
            else:
                role = role_choice
            if not role:
                continue
            lookup[nren_name].append((suborg_name, role))
    for nren_name, suborgs in lookup.items():
        for suborg_name, role in suborgs:
            db.session.merge(presentation_models.SubOrganization(
                nren=nren_dict[nren_name],
                nren_id=nren_dict[nren_name].id,
                year=2022,
                organization=suborg_name,
                role=role,
            ))
    db.session.commit()
def transfer_charging_structure(nren_dict):
    """Import the 2022 charging-structure answers into the presentation model."""
    # replace any previously imported 2022 rows
    db.session.execute(delete(presentation_models.ChargingStructure).where(
        presentation_models.ChargingStructure.year == 2022)
    )
    for row in query_question(ChargingStructure.charging_structure):
        abbrev = row[0].upper()
        answer = row[1].replace('"', '').strip()
        if abbrev not in nren_dict:
            logger.info(f'{abbrev} unknown. Skipping from charging structure.')
            continue
        # translate the free-text answer to the FeeType enum by substring
        # matching; unrecognised answers are stored with a null fee type
        if "do not charge" in answer:
            fee_type = FeeType.no_charge
        elif "combination" in answer:
            fee_type = FeeType.combination
        elif "flat" in answer:
            fee_type = FeeType.flat_fee
        elif "usage-based" in answer:
            fee_type = FeeType.usage_based_fee
        elif "Other" in answer:
            fee_type = FeeType.other
        else:
            fee_type = None
        db.session.merge(presentation_models.ChargingStructure(
            nren=nren_dict[abbrev],
            nren_id=nren_dict[abbrev].id,
            year=2022,
            fee_type=fee_type,
        ))
    db.session.commit()
def transfer_ec_projects(nren_dict):
    """Import 2022 EC project participation into the presentation model.

    Each answer is a json list of project names; html entities, NBSPs and
    trailing contract numbers are stripped before storing.
    """
    # delete all existing EC projects, in case something changed
    db.session.execute(
        delete(presentation_models.ECProject).where(presentation_models.ECProject.year == 2022)
    )
    for row in query_question(ECQuestion.EC_PROJECT):
        abbrev = row[0].upper()
        if abbrev not in nren_dict:
            logger.warning(f'{abbrev} unknown. Skipping.')
            continue
        try:
            project_list = json.loads(row[1])
        except json.decoder.JSONDecodeError:
            logger.info(f'JSON decode error for EC project data for {abbrev}. Skipping.')
            continue
        for project in project_list:
            if not project:
                logger.info(f'Empty EC project value for {abbrev}.')
                continue
            # strip html entities/NBSP from the answer
            project = html.unescape(project).replace('\xa0', ' ')
            # some answers include contract numbers, which we don't want here
            project = project.split('(contract n')[0]
            db.session.add(presentation_models.ECProject(
                nren=nren_dict[abbrev],
                nren_id=nren_dict[abbrev].id,
                year=2022,
                project=str(project).strip()
            ))
    db.session.commit()
def transfer_policies(nren_dict):
    """
    Answers are strings that should be urls, but sometimes there's other stuff
    like email addresses or random text.

    Collects the eight policy questions per NREN/year, cleans the answers,
    blanks anything that is not a valid URL, and merges one Policy row per
    (NREN, year) that has at least one usable value.
    """
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.Policy).where(
        presentation_models.Policy.year <= 2022)
    )
    # question key -> {year: survey question id}
    policy_questions = {
        'strategy': {2022: 16469, 2021: 16064, 2020: 15720, 2019: 15305, 2018: 14910},
        'environment': {2022: 16471, 2021: 16066, 2020: 15722, 2019: 15307, 2018: 14912},
        'equality': {2022: 16473, 2021: 16378},
        'connectivity': {2022: 16475, 2021: 16068, 2020: 15724, 2019: 15309, 2018: 14914},
        'acceptable_use': {2022: 16477, 2021: 16070, 2020: 15726, 2019: 15311, 2018: 14916},
        'privacy': {2022: 16479, 2021: 16072, 2020: 15728, 2019: 15575},
        'data_protection': {2022: 16481, 2021: 16074, 2020: 15730, 2019: 15577},
        'gender': {2022: 16761}
    }
    data = {}
    for year in [2018, 2019, 2020, 2021, 2022]:
        policy_questions_year = {key: years[year] for key, years in policy_questions.items() if year in years}
        for question_key, question_id in policy_questions_year.items():
            rows = query_question_id(question_id, year)
            for row in rows:
                nren_name = row[0].upper()
                _value = row[1]
                if nren_name not in nren_dict:
                    logger.warning(f'{nren_name} unknown. Skipping.')
                    continue
                # BUG FIX: `_value.split()[0]` raised IndexError on empty or
                # whitespace-only answers; treat those as empty.
                parts = _value.split()
                value = parts[0].strip('"') if parts else ''
                if not value:
                    # don't warn on empty answers, just skip
                    continue
                if value.upper() == 'N.A.' or ('.' not in value and '@' not in value):
                    # this test is a bit silly but does seem to filter out all the nonsense responses
                    logger.warning(f'"{value}" does not look like an email address or link. Skipping.')
                    continue
                if _value not in [f'"{value}"', value]:
                    logger.info(f'Cleaned policy answer: "{_value}" became "{value}"')
                # initialize on first use, so we don't add data for nrens with no answers
                data.setdefault((nren_name, year), {q: '' for q in policy_questions.keys()})
                data[(nren_name, year)][question_key] = value
    # question key -> Policy model field name
    field_names = {
        'strategy': 'strategic_plan',
        'environment': 'environmental',
        'equality': 'equal_opportunity',
        'connectivity': 'connectivity',
        'acceptable_use': 'acceptable_use',
        'privacy': 'privacy_notice',
        'data_protection': 'data_protection',
        'gender': 'gender_equality',
    }
    for (nren_name, year), nren_info in data.items():
        # blank out anything that is not a valid URL (replaces eight
        # previously copy-pasted validation blocks)
        cleaned = {
            field: (nren_info[key] if nren_info[key] and valid_url(nren_info[key]) else '')
            for key, field in field_names.items()
        }
        if not any(cleaned.values()):
            logger.info(f'{nren_name} has no policy data. Skipping.')
            continue
        policy_data = presentation_models.Policy(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            **cleaned
        )
        db.session.merge(policy_data)
    db.session.commit()
def transfer_central_procurement(nren_dict):
    """Import central-procurement answers (yes/no plus the procured amount)."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.CentralProcurement).where(
        presentation_models.CentralProcurement.year <= 2022)
    )
    # (nren, year) -> procured amount, parsed from the quoted answer text
    amounts = {
        (nren_name, year): Decimal(answer.strip('"'))
        for _, nren_name, year, answer in recursive_query(16483)
    }
    for _, nren_name, year, answer in recursive_query(16482):
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        db.session.merge(presentation_models.CentralProcurement(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            central_procurement=answer == '"Yes"',
            amount=amounts.get((nren_name, year))
        ))
    db.session.commit()
def transfer_service_management(nren_dict):
    """Import service-management-framework and service-level-target answers."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.ServiceManagement).where(
        presentation_models.ServiceManagement.year <= 2022)
    )

    def _yes_no(question_id):
        # (nren, year) -> bool for a yes/no question
        return {(n, y): a == '"Yes"' for _, n, y, a in recursive_query(question_id)}

    framework = _yes_no(16484)
    targets = _yes_no(16485)
    for key in framework.keys() | targets.keys():
        nren_name, year = key
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        db.session.merge(presentation_models.ServiceManagement(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            service_management_framework=framework.get(key),
            service_level_targets=targets.get(key)
        ))
    db.session.commit()
def transfer_service_user_types(nren_dict):
    """Import, per service category, which user types may use the services."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.ServiceUserTypes).where(
        presentation_models.ServiceUserTypes.year <= 2022)
    )
    # service category -> survey question id
    question_ids = {
        ServiceCategory.identity: 16488,
        ServiceCategory.network_services: 16489,
        ServiceCategory.collaboration: 16490,
        ServiceCategory.security: 16491,
        ServiceCategory.isp_support: 16492,
        ServiceCategory.storage_and_hosting: 16493,
        ServiceCategory.multimedia: 16494,
        ServiceCategory.professional_services: 16495,
    }
    for service_category, question_id in question_ids.items():
        for _, nren_name, year, answer in recursive_query(question_id):
            if nren_name not in nren_dict:
                logger.warning(f'{nren_name} unknown. Skipping.')
                continue
            # the answer is a json list of user-type labels
            for raw_user_type in json.loads(answer):
                user_category = UserCategory[SERVICE_USER_TYPE_TO_CODE[raw_user_type]]
                db.session.merge(presentation_models.ServiceUserTypes(
                    nren=nren_dict[nren_name],
                    nren_id=nren_dict[nren_name].id,
                    year=year,
                    user_category=user_category,
                    service_category=service_category
                ))
    db.session.commit()
def transfer_standards(nren_dict):
    """Import audit, business-continuity and crisis-management answers."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.Standards).where(
        presentation_models.Standards.year <= 2022)
    )

    def _yes_no(question_id):
        # (nren, year) -> bool for a yes/no question
        return {(n, y): a == '"Yes"' for _, n, y, a in recursive_query(question_id)}

    def _free_text(question_id):
        # (nren, year) -> unquoted free-text answer
        return {(n, y): a.strip('"') for _, n, y, a in recursive_query(question_id)}

    audits = _yes_no(16499)
    audit_specifics = _free_text(16500)
    bcp = _yes_no(16501)
    bcp_specifics = _free_text(16502)
    crisis_mgmt = _yes_no(16762)
    all_keys = (audits.keys() | audit_specifics.keys() | bcp.keys()
                | bcp_specifics.keys() | crisis_mgmt.keys())
    for key in all_keys:
        nren_name, year = key
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        db.session.merge(presentation_models.Standards(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            audits=audits.get(key),
            audit_specifics=audit_specifics.get(key, ""),
            business_continuity_plans=bcp.get(key),
            business_continuity_plans_specifics=bcp_specifics.get(key, ""),
            crisis_management_procedure=crisis_mgmt.get(key)
        ))
    db.session.commit()
def transfer_crisis_exercises(nren_dict):
    """Import crisis-exercise participation, mapping the long survey answer
    texts to the short codes used by the presentation model."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.CrisisExercises).where(
        presentation_models.CrisisExercises.year <= 2022)
    )
    # short code -> full answer text as it appears in the survey
    crisis_exercises_map = {
        "geant_workshops": "We participate in GEANT Crisis workshops such as CLAW",
        "national_excercises": "We participated in National crisis exercises ",
        "tabletop_exercises": "We run our own tabletop exercises",
        "simulation_excercises": "We run our own simulation exercises",
        "other_excercises": "We have done/participated in other exercises or trainings",
        "real_crisis": "We had a real crisis",
        "internal_security_programme": "We run an internal security awareness programme",
        "none": "No, we have not done any crisis exercises or trainings",
    }
    description_to_code = {v: k for k, v in crisis_exercises_map.items()}
    for _, nren_name, year, answer in recursive_query(16763):
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        # unknown descriptions are kept verbatim
        codes = [
            description_to_code.get(description, description)
            for description in json.loads(answer)
        ]
        db.session.merge(presentation_models.CrisisExercises(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            exercise_descriptions=codes
        ))
    db.session.commit()
def transfer_security_controls(nren_dict):
    """Import which security controls each NREN has in place (2021+ only)."""
    # short presentation-model code -> answer text as it appears in the survey
    controls_map = {
        "anti_virus": "Anti Virus",
        "anti_spam": "Anti-Spam",
        "firewall": "Firewall",
        "ddos_mitigation": "DDoS mitigation",
        "monitoring": "Network monitoring",
        "ips_ids": "IPS/IDS",
        "acl": "ACL",
        "segmentation": "Network segmentation",
        "integrity_checking": "Integrity checking"
    }
    reversed_map = {v: k for k, v in controls_map.items()}
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.SecurityControls).where(
        presentation_models.SecurityControls.year <= 2022)
    )
    # 16503: multiple-choice control selections; 16504: free-text "other" answer
    sc = recursive_query(16503)
    sc = {(nren_name, year): json.loads(answer) for answer_id, nren_name, year, answer in sc}
    sc_other = recursive_query(16504)
    sc_other = {(nren_name, year): json.loads(answer) for answer_id, nren_name, year, answer in sc_other}
    # the free-text answer is sometimes a bare string, sometimes a list; normalize
    for key, value in sc_other.items():
        if not isinstance(value, list):
            sc_other[key] = [value]
    for nren_name, year in sc.keys() | sc_other.keys():
        # NOTE(review): the comment below says "prior to 2022" but the condition
        # keeps 2021 as well — confirm which boundary is intended.
        if year < 2021:  # prior to 2022, the mapping is different, use a different data source
            continue
        # TODO: import the pre-2022 data from a handmade CSV.
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        full_list = sc.get((nren_name, year), [])
        # drop placeholder/junk "other" strings, then fold the rest into one entry
        other_entries = [e.strip() for e in sc_other.get((nren_name, year), [])
                         if e.strip() and e.lower() not in ["n/a", "-"]]
        other_entry = ", ".join(other_entries)
        if other_entry:
            full_list.append(other_entry)
        if "Other" in full_list:
            full_list.remove("Other")
        # translate known answer texts back to short codes; unknown ones are kept
        full_list = [reversed_map.get(control, control) for control in full_list]
        if full_list:
            new_entry = presentation_models.SecurityControls(
                nren=nren_dict[nren_name],
                nren_id=nren_dict[nren_name].id,
                year=year,
                security_control_descriptions=full_list
            )
            db.session.merge(new_entry)
    db.session.commit()
def transfer_eosc_listings(nren_dict):
    """Import the names of services each NREN has listed in the EOSC portal."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.EOSCListings).where(
        presentation_models.EOSCListings.year <= 2022)
    )
    for _, nren_name, year, answer in recursive_query(16497):
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        # the answer is a json list; drop empty entries
        service_names = [name for name in json.loads(answer) if name]
        db.session.merge(presentation_models.EOSCListings(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            service_names=service_names
        ))
    db.session.commit()
def transfer_commercial_connectivity(nren_dict):
    """Import, per customer segment, how much commercial connectivity each NREN offers."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.CommercialConnectivity).where(
        presentation_models.CommercialConnectivity.year <= 2022)
    )
    # normalised answer text -> CommercialConnectivityCoverage enum key
    simple_connection = {
        key.replace(" ", "").replace("-", "").replace("/", "").lower(): value for key, value in CONNECTION.items()
    }

    def get_coverage(db_string):
        # normalise the raw answer the same way the lookup keys were normalised
        cleaned_str = db_string.strip('"').replace(" ", "").replace("-", "").replace("/", "").lower()
        return CommercialConnectivityCoverage[simple_connection[cleaned_str]]

    # model field -> survey question id
    question_ids = (
        ('commercial_service_provider', 16646),
        ('commercial_collaboration', 16647),
        ('commercial_r_and_e', 16648),
        ('commercial_general', 16649),
        ('university_spin_off', 16650),
    )
    answers = {
        field: {(n, y): get_coverage(a) for _, n, y, a in recursive_query(qid)}
        for field, qid in question_ids
    }
    all_keys = set()
    for per_field in answers.values():
        all_keys |= per_field.keys()
    for nren_name, year in all_keys:
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        db.session.merge(presentation_models.CommercialConnectivity(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            **{field: per_field.get((nren_name, year)) for field, per_field in answers.items()}
        ))
    db.session.commit()
def transfer_commercial_charging_level(nren_dict):
    """Import the charging level applied to each commercial connectivity segment."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.CommercialChargingLevel).where(
        presentation_models.CommercialChargingLevel.year <= 2022)
    )
    # normalised answer text -> CommercialCharges enum key
    simple_charging = {
        key.replace(" ", "").replace("-", "").replace("/", "").lower(): value for key, value in CHARGING_LEVELS.items()
    }
    # a couple of historical answer variants that need explicit mappings
    simple_charging["nochargesapplied"] = "no_charges_if_r_e_requested"
    simple_charging['nochargesappliedifrequestedbyr+eusers\\"needed?'] = "no_charges_if_r_e_requested"

    def get_charging(db_string):
        # some answers are json lists; use the first element
        if db_string[0] == '[':
            db_string = json.loads(db_string)[0]
        cleaned_str = db_string.strip('"').replace(" ", "").replace("-", "").replace("/", "").lower()
        return CommercialCharges[simple_charging[cleaned_str]]

    # model field -> survey question id
    answers = {
        field: {(n, y): get_charging(a) for _, n, y, a in recursive_query(qid)}
        for field, qid in (
            ('collaboration', 16652),
            ('service_supplier', 16653),
            ('direct_peering', 16654),
        )
    }
    all_keys = set()
    for per_field in answers.values():
        all_keys |= per_field.keys()
    for nren_name, year in all_keys:
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        db.session.merge(presentation_models.CommercialChargingLevel(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            **{field: per_field.get((nren_name, year)) for field, per_field in answers.items()}
        ))
    db.session.commit()
def transfer_fibre_light(nren_dict):
    """Import how each NREN lights its fibre.

    Two questions feed this: a choice answer (16668) and a free-text comment
    (16669) that is only expected when the choice was "Other"; in that case
    the comment replaces the description.
    """
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.FibreLight).where(
        presentation_models.FibreLight.year <= 2022)
    )
    # NOTE(review): .get() returns None if 16668 is missing from the mapping,
    # which would make comment_map.get(...) below raise AttributeError —
    # confirm the key is always present.
    comment_map = mapping.VALUE_TO_CODE_MAPPING.get(16668)
    fibre = recursive_query(16668)
    fibre = {(nren_name, year): answer.strip('"') for answer_id, nren_name, year, answer in fibre}
    fibre_comment = recursive_query(16669)
    fibre_comment = {(nren_name, year): answer.strip('"') for answer_id, nren_name, year, answer in fibre_comment}
    for nren_name, year in fibre.keys() | fibre_comment.keys():
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        description = fibre.get((nren_name, year))
        comment = fibre_comment.get((nren_name, year))
        if description and description[0:5] != "Other":
            # a comment alongside a non-"Other" description is unexpected;
            # keep the description and just log the discrepancy
            if comment and comment.replace("-", "") != "":
                logger.warning(
                    f'fibre light comment while description is not "Other": {description} {comment} {nren_name}.'
                )
        else:
            # description absent or starts with "Other": the free-text comment wins
            description = comment
        if description:
            # translate known answer texts to short codes; strip escape backslashes
            description = comment_map.get(description, description).replace("\\", "")
        new_entry = presentation_models.FibreLight(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            light_description=description
        )
        db.session.merge(new_entry)
    db.session.commit()
def transfer_network_map_urls(nren_dict):
    """Import network-map URLs; json parsing is cross-checked against regex
    extraction and mismatches are logged (the regex result is what is stored)."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.NetworkMapUrls).where(
        presentation_models.NetworkMapUrls.year <= 2022)
    )
    for _, nren_name, year, answer in recursive_query(16670):
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        regex_urls = extract_urls(text=answer)
        json_urls = _parse_json_urls(answer, nren_name)
        if regex_urls != json_urls:
            logger.info(
                f'Institution URLs for {nren_name} do not match between json and regex. '
                f'{regex_urls} != {json_urls}'
            )
        if not regex_urls:
            logger.info(f'{nren_name} has no urls for {year}. Skipping.')
            continue
        db.session.merge(presentation_models.NetworkMapUrls(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            urls=regex_urls
        ))
    db.session.commit()
def transfer_traffic_statistics(nren_dict):
    """Import whether NRENs publish traffic statistics, plus the URLs.

    URLs are extracted via regex and cross-checked against json parsing
    (mismatches only logged); URLs failing validation are dropped, and
    entries without any valid URL are skipped entirely.
    """
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.TrafficStatistics).where(
        presentation_models.TrafficStatistics.year <= 2022)
    )
    stats = recursive_query(16677)
    stat_urls = recursive_query(16678)
    stat_urls = {(nren_name, year): answer for answer_id, nren_name, year, answer in stat_urls}
    for answer_id, nren_name, year, answer in stats:
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        db_urls = stat_urls.get((nren_name, year))
        if db_urls:
            urls = extract_urls(text=db_urls)
            urls_json = _parse_json_urls(db_urls, nren_name)
            if urls != urls_json:
                logger.info(
                    f'Traffic stat URLs for {nren_name} do not match between json and regex. {urls} != {urls_json}'
                )
            db_urls = urls
        else:
            db_urls = []
        valid_urls = []
        for url in db_urls:
            if valid_url(url):
                valid_urls.append(url)
            else:
                logger.warning(f'Invalid Traffic Statistics URL for {nren_name} in {year}: {url}. Skipping.')
        if not valid_urls:
            logger.info(f'{nren_name} has no valid urls for {year}. Skipping.')
            continue
        new_entry = presentation_models.TrafficStatistics(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            traffic_statistics=answer == '"Yes"',
            # BUG FIX: store only the validated URLs; previously the unfiltered
            # list (including URLs just logged as invalid) was persisted.
            urls=valid_urls
        )
        db.session.merge(new_entry)
    db.session.commit()
def transfer_siem_vendors(nren_dict):
    """Import SIEM/SOC vendor selections.

    The free-text "other" answer (16680) is appended to the multiple-choice
    list (16679), and the literal "Other" placeholder is dropped.
    """
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.SiemVendors).where(
        presentation_models.SiemVendors.year <= 2022)
    )
    vendors = recursive_query(16679)
    vendors = {(nren_name, year): json.loads(answer) for answer_id, nren_name, year, answer in vendors}
    vendor_comment = recursive_query(16680)
    vendor_comment = {(nren_name, year): answer.strip('"') for answer_id, nren_name, year, answer in vendor_comment}
    for nren_name, year in vendors.keys() | vendor_comment.keys():
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        vendor_names = vendors.get((nren_name, year), [])
        comment = vendor_comment.get((nren_name, year))
        if comment:
            vendor_names.append(comment)
        # BUG FIX: guard the removal — a comment can be present without the
        # "Other" choice having been ticked, which used to raise ValueError
        # (the certificate-providers importer already guards the same way).
        if "Other" in vendor_names:
            vendor_names.remove("Other")
        new_entry = presentation_models.SiemVendors(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            vendor_names=vendor_names
        )
        db.session.merge(new_entry)
    db.session.commit()
def transfer_certificate_providers(nren_dict):
    """Import which certificate providers each NREN uses.

    Choice answers (16681) are combined with the free-text "other" answer
    (16682); the literal "Other" placeholder is dropped, and the many
    spellings of Let's Encrypt are normalised.
    """
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.CertificateProviders).where(
        presentation_models.CertificateProviders.year <= 2022)
    )
    # NOTE(review): assumes 16681 is always present in VALUE_TO_CODE_MAPPING;
    # .get() would otherwise return None and provider_map.get below would
    # raise AttributeError — confirm.
    provider_map = mapping.VALUE_TO_CODE_MAPPING.get(16681)
    providers = recursive_query(16681)
    providers = {(nren_name, year): json.loads(answer) for answer_id, nren_name, year, answer in providers}
    prov_comment = recursive_query(16682)
    prov_comment = {(nren_name, year): answer.strip('"') for answer_id, nren_name, year, answer in prov_comment}
    for nren_name, year in providers.keys() | prov_comment.keys():
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        provider_names = providers.get((nren_name, year), [])
        comment = prov_comment.get((nren_name, year))
        if comment:
            provider_names.append(comment)
        if "Other" in provider_names:
            provider_names.remove("Other")

        def _replace_provider(provider):
            # normalise Let's Encrypt spellings, then map known answer texts
            # to their short codes (unknown ones pass through unchanged)
            if 'let' in provider.lower() and 'encrypt' in provider.lower():
                return "Let's Encrypt"
            return provider_map.get(provider, provider)

        provider_names = [_replace_provider(p) for p in provider_names]
        new_entry = presentation_models.CertificateProviders(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            provider_names=provider_names
        )
        db.session.merge(new_entry)
    db.session.commit()
def transfer_weather_map(nren_dict):
    """Import whether each NREN publishes a network weather map, and its URL.

    The URL answer (16684) is cleaned via regex extraction; entries whose
    URL fails validation are skipped entirely.
    """
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.WeatherMap).where(
        presentation_models.WeatherMap.year <= 2022)
    )
    weather = recursive_query(16683)
    urls = recursive_query(16684)
    urls = {(nren_name, year): answer.strip('" ') for answer_id, nren_name, year, answer in urls}
    for answer_id, nren_name, year, answer in weather:
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        url = urls.get((nren_name, year), "")
        if url:
            # keep only the first URL the regex finds in the free-text answer
            found_urls = extract_urls(text=url)
            if found_urls:
                url = found_urls[0]
            else:
                url = ""
            orig_url = urls.get((nren_name, year), "").strip("/")
            if url != orig_url:
                logger.info(f'Weather URL for {nren_name} do not match between json and regex. {url} != {orig_url}')
        valid = valid_url(url)
        if not valid:
            # NOTE(review): this also skips "No" answers that have no URL at
            # all, so those NRENs get no row — confirm that is intended.
            logger.warning(f'Invalid WeatherMap URL for {nren_name} in {year}: {url}. Skipping.')
            continue
        new_entry = presentation_models.WeatherMap(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            weather_map=answer == '"Yes"',
            url=url
        )
        db.session.merge(new_entry)
    db.session.commit()
def transfer_pert_team(nren_dict):
    """Import whether each NREN runs a PERT (yes / no / planned)."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.PertTeam).where(
        presentation_models.PertTeam.year <= 2022)
    )
    for _, nren_name, year, answer in recursive_query(16685):
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        # unanswered questions come through as the literal string "null"
        if answer == "null":
            continue
        status = YesNoPlanned[answer.strip('"').lower()]
        db.session.merge(presentation_models.PertTeam(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            pert_team=status
        ))
    db.session.commit()
def transfer_alien_wave(nren_dict):
    """Import third-party and internal alien-wave usage per NREN/year."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.AlienWave).where(
        presentation_models.AlienWave.year <= 2022)
    )
    third_party = {
        (n, y): YesNoPlanned[a.strip('"').lower()]
        for _, n, y, a in recursive_query(16687)
    }
    service_counts = {(n, y): int(a.strip('"')) for _, n, y, a in recursive_query(16688)}
    internal = {(n, y): a == '"Yes"' for _, n, y, a in recursive_query(16689)}
    for key in third_party.keys() | service_counts.keys() | internal.keys():
        nren_name, year = key
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        db.session.merge(presentation_models.AlienWave(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            alien_wave_third_party=third_party.get(key),
            nr_of_alien_wave_third_party_services=service_counts.get(key),
            alien_wave_internal=internal.get(key)
        ))
    db.session.commit()
def transfer_external_connections(nren_dict):
    """Import up to 10 external connections per NREN per year.

    Each connection is spread over several survey questions (one question per
    field per connection slot); question_nrs maps a survey question id to the
    (connection slot, field name) it fills.
    """
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.ExternalConnections).where(
        presentation_models.ExternalConnections.year <= 2022)
    )
    question_nrs = {
        16694: (5, "capacity"),
        16695: (7, "capacity"),
        16696: (6, "capacity"),
        16697: (7, "from_organization"),
        16698: (1, "to_organization"),
        16699: (8, "to_organization"),
        16700: (9, "to_organization"),
        16701: (1, "from_organization"),
        16702: (8, "capacity"),
        16703: (5, "to_organization"),
        16704: (0, "link_name"),
        16705: (1, "link_name"),
        16706: (9, "capacity"),
        16707: (2, "link_name"),
        16708: (0, "from_organization"),
        16709: (4, "link_name"),
        16710: (3, "link_name"),
        16711: (9, "link_name"),
        16712: (7, "link_name"),
        16713: (8, "link_name"),
        16714: (6, "link_name"),
        16715: (5, "link_name"),
        16716: (4, "from_organization"),
        16717: (5, "from_organization"),
        16718: (6, "from_organization"),
        16719: (2, "to_organization"),
        16720: (3, "to_organization"),
        16721: (4, "to_organization"),
        16722: (6, "to_organization"),
        16723: (7, "to_organization"),
        16724: (2, "interconnection_method"),
        16725: (3, "interconnection_method"),
        16726: (4, "interconnection_method"),
        16727: (5, "interconnection_method"),
        16728: (8, "from_organization"),
        16729: (9, "from_organization"),
        16730: (0, "to_organization"),
        16731: (0, "capacity"),
        16732: (1, "capacity"),
        16733: (2, "capacity"),
        16734: (3, "capacity"),
        16735: (4, "capacity"),
        16736: (3, "from_organization"),
        16737: (2, "from_organization"),
        16738: (1, "interconnection_method"),
        16739: (7, "interconnection_method"),
        16740: (8, "interconnection_method"),
        16741: (0, "interconnection_method"),
        16742: (9, "interconnection_method"),
        16743: (6, "interconnection_method")
    }

    def empty_connection_dict():
        # default shape for a single connection entry
        return {'link_name': '', 'capacity': None, 'from_organization': '',
                'to_organization': '', 'interconnection_method': None}

    # (nren, year, slot) -> partially filled connection dict
    connection_dicts = {}
    nren_year_set = set()
    for question_id, (connection_nr, field) in question_nrs.items():
        rows = recursive_query(question_id)
        for answer_id, nren_name, year, answer in rows:
            nren_year_set.add((nren_name, year))
            conn_dict = connection_dicts.setdefault((nren_name, year, connection_nr), empty_connection_dict())
            conn_dict[field] = answer.strip('" ')
    # normalised answer text -> ConnectionMethod enum key
    int_simple = {key.replace(" ", "").lower(): value for key, value in INTERCONNECTION.items()}
    int_simple['openexchangepoi'] = "open_exchange"
    for (conn_nren, _conn_year, _conn_nr), conn_dict in connection_dicts.items():
        if conn_dict['capacity']:
            try:
                # capacity answers look like e.g. "100 Gbit/s"; keep the number
                conn_dict['capacity'] = str(Decimal(conn_dict['capacity'].split('G')[0].strip()))
            except Exception:
                # BUG FIX: previously logged a stale `nren_name` left over from
                # the collection loop above; report the connection's own NREN.
                # (Also: bare `except:` narrowed to `except Exception:`.)
                logger.warning(f'Capacity could not be converted for {conn_nren}: {conn_dict["capacity"]}.')
                conn_dict['capacity'] = None
        if conn_dict['interconnection_method']:
            int_conn = int_simple[conn_dict['interconnection_method'].replace(" ", "").lower()]
            conn_dict['interconnection_method'] = ConnectionMethod[int_conn].value
    for nren_name, year in nren_year_set:
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        connections = []
        for connection_nr in range(0, 10):
            conn = connection_dicts.get((nren_name, year, connection_nr))
            if conn:
                connections.append(conn)
        new_entry = presentation_models.ExternalConnections(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            connections=connections
        )
        db.session.merge(new_entry)
    db.session.commit()
def transfer_network_automation(nren_dict):
    """Import network-automation status plus which tasks are automated."""
    # for this data, nothing comes from the excel publisher, so we can delete all
    db.session.execute(delete(presentation_models.NetworkAutomation).where(
        presentation_models.NetworkAutomation.year <= 2022)
    )
    # long answer text -> short code for the automated-tasks question
    task_code_map = mapping.VALUE_TO_CODE_MAPPING.get(16758)
    rows = recursive_query(16757)
    task_answers = {(n, y): json.loads(a) for _, n, y, a in recursive_query(16758)}
    for _, nren_name, year, answer in rows:
        if nren_name not in nren_dict:
            logger.warning(f'{nren_name} unknown. Skipping.')
            continue
        status = YesNoPlanned[answer.strip('"').lower()]
        specifics = [
            task_code_map.get(task, task)
            for task in task_answers.get((nren_name, year), []) if task
        ]
        db.session.merge(presentation_models.NetworkAutomation(
            nren=nren_dict[nren_name],
            nren_id=nren_dict[nren_name].id,
            year=year,
            network_automation=status,
            network_automation_specifics=specifics
        ))
    db.session.commit()
def transfer_network_function_virtualisation(nren_dict):
    """Publish NetworkFunctionVirtualisation entries from the legacy survey.

    Nothing for this dataset comes from the excel publisher, so every
    entry up to and including 2022 is deleted and re-published here.
    """
    db.session.execute(delete(presentation_models.NetworkFunctionVirtualisation).where(
        presentation_models.NetworkFunctionVirtualisation.year <= 2022)
    )

    # Maps raw survey answer strings for question 16755 to canonical codes.
    code_map = mapping.VALUE_TO_CODE_MAPPING.get(16755)

    answers = recursive_query(16754)
    types_by_key = {
        (name, yr): json.loads(raw)
        for _aid, name, yr, raw in recursive_query(16755)
    }
    comments_by_key = {
        (name, yr): raw.strip('" ')
        for _aid, name, yr, raw in recursive_query(16756)
    }

    for _aid, name, yr, raw_answer in answers:
        if name not in nren_dict:
            logger.warning(f'{name} unknown. Skipping.')
            continue

        nfv = YesNoPlanned[raw_answer.strip('"').lower()]

        # Each stored value may itself be a comma-separated list; flatten it.
        specifics = []
        for entry in types_by_key.get((name, yr), []):
            if entry:
                specifics.extend(entry.split(', '))

        comment = comments_by_key.get((name, yr), "").replace("-", "")
        if comment:
            specifics.append(comment)
        # A free-text comment supersedes the generic "Other" marker.
        if "Other" in specifics:
            specifics.remove("Other")

        db.session.merge(presentation_models.NetworkFunctionVirtualisation(
            nren=nren_dict[name],
            nren_id=nren_dict[name].id,
            year=yr,
            nfv=nfv,
            nfv_specifics=[code_map.get(s, s) for s in specifics]
        ))
    db.session.commit()
def transfer_monitoring_tools(nren_dict):
    """Publish MonitoringTools entries from the legacy survey database.

    Nothing for this dataset comes from the excel publisher, so every
    entry up to and including 2022 is deleted and re-published here.
    """
    db.session.execute(delete(presentation_models.MonitoringTools).where(
        presentation_models.MonitoringTools.year <= 2022)
    )

    # Maps raw survey answer strings for question 16672 to canonical codes.
    code_map = mapping.VALUE_TO_CODE_MAPPING.get(16672)

    tools_by_key = {
        (name, yr): json.loads(raw)
        for _aid, name, yr, raw in recursive_query(16672)
    }
    comments_by_key = {
        (name, yr): raw.strip('" ')
        for _aid, name, yr, raw in recursive_query(16673)
    }
    netflow_by_key = {
        (name, yr): raw.strip('" ')
        for _aid, name, yr, raw in recursive_query(16674)
    }

    # Visit every (nren, year) that answered at least one of the questions.
    for name, yr in tools_by_key.keys() | comments_by_key.keys() | netflow_by_key.keys():
        if name not in nren_dict:
            logger.warning(f'{name} unknown. Skipping.')
            continue

        descriptions = tools_by_key.get((name, yr), [])
        comment = comments_by_key.get((name, yr), "").replace("-", "")
        if comment:
            descriptions.append(comment)
        # Drop the generic "Other"/"Other " markers; the comment carries the detail.
        for marker in ("Other", "Other "):
            if marker in descriptions:
                descriptions.remove(marker)

        db.session.merge(presentation_models.MonitoringTools(
            nren=nren_dict[name],
            nren_id=nren_dict[name].id,
            year=yr,
            tool_descriptions=[code_map.get(d, d) for d in descriptions],
            netflow_processing_description=netflow_by_key.get((name, yr), "")
        ))
    db.session.commit()
def _cli(app):
    """Run every legacy-survey transfer step inside the Flask app context.

    The steps are executed in a fixed order matching the survey layout;
    each one receives the shared uppercase-keyed NREN lookup dict.
    """
    transfer_steps = (
        transfer_budget,
        transfer_funding_sources,
        transfer_staff_data,
        transfer_nren_parent_org,
        transfer_nren_sub_org,
        transfer_charging_structure,
        transfer_ec_projects,
        transfer_policies,
        transfer_institutions_urls,
        transfer_central_procurement,
        transfer_service_management,
        transfer_service_user_types,
        transfer_standards,
        transfer_crisis_exercises,
        transfer_security_controls,
        transfer_eosc_listings,
        transfer_commercial_connectivity,
        transfer_commercial_charging_level,
        transfer_fibre_light,
        transfer_network_map_urls,
        transfer_monitoring_tools,
        transfer_traffic_statistics,
        transfer_siem_vendors,
        transfer_certificate_providers,
        transfer_weather_map,
        transfer_pert_team,
        transfer_alien_wave,
        transfer_external_connections,
        # traffic ratio was freeform text so we don't transfer it
        transfer_network_function_virtualisation,
        transfer_network_automation,
    )
    with app.app_context():
        nren_dict = helpers.get_uppercase_nren_dict()
        for step in transfer_steps:
            step(nren_dict)
@click.command()
@click.option('--config', type=click.STRING, default='config.json')
def cli(config):
    """Publish legacy survey data using settings from the given JSON config file.

    :param config: path to the application config file (default ``config.json``).
    """
    # Use a context manager so the config file handle is always closed;
    # the previous code passed an open() result to load() and leaked it.
    with open(config, 'r') as config_file:
        app_config = load(config_file)
    # Attach the legacy survey DB as an extra SQLAlchemy bind alongside
    # the main presentation database.
    app_config['SQLALCHEMY_BINDS'] = {survey_model.SURVEY_DB_BIND: app_config['SURVEY_DATABASE_URI']}
    app = compendium_v2._create_app_with_db(app_config)
    _cli(app)
# Allow running this module directly as a script (it is also registered
# as a console entry point when the package is installed).
if __name__ == "__main__":
    cli()
......@@ -33,6 +33,8 @@ setup(
'conversion=compendium_v2.conversion.conversion:cli',
'dump_survey_model=compendium_v2.migrations.dump_survey_model:cli',
'legacy-survey-publisher=compendium_v2.publishers.legacy_publisher.survey_publisher_legacy:cli',
'excel-survey-publisher=compendium_v2.publishers.survey_publisher_legacy_excel:cli',
'db-publisher-2022=compendium_v2.publishers.survey_publisher_old_db_2022:cli',
]
},
license='MIT',
......
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment