Skip to content
Snippets Groups Projects
Commit 40116778 authored by Bjarke Madsen's avatar Bjarke Madsen
Browse files

Add survey-publisher-legacy script for converting old response data into new...

Add survey-publisher-legacy script for converting old response data into new surveys (reusing 2024 survey template)
parent 7a38e9fd
No related branches found
No related tags found
No related merge requests found
......@@ -2,10 +2,12 @@
All notable changes to this project will be documented in this file.
## [0.78] - 2025-01-13
- COMP-371: Add S&P - EOSC Listings page (unlisted due to bad data)
- Only render http(s) links as links in URL tables
- Get rid of redundant collapsible-column css & fix up some css
- Add survey-publisher-legacy script for converting old response data into new surveys (reusing 2024 survey template)
## [0.77] - 2025-01-10
- COMP-369: Add Network - Capacity - External IP Connections page
......
import click
from itertools import chain
from collections import defaultdict
from typing import Dict, Any
import compendium_v2
from compendium_v2.db import db
from compendium_v2.config import load
from compendium_v2.db.presentation_models import NREN
from compendium_v2.survey_db import model as survey_model
from compendium_v2.db.survey_models import SurveyResponse, Survey, SurveyStatus, ResponseStatus, SurveyNotes
from compendium_v2.publishers.legacy_publisher.survey_publisher_legacy_db import fetch_data as fetch_data_db
from compendium_v2.publishers.legacy_publisher.survey_publisher_legacy_excel import fetch_data as fetch_data_excel
def insert_survey_data(survey_2024: Survey, nren: NREN, year: int, answer: Dict[str, Any]):
    """
    Store one set of legacy answers as a completed survey response.

    The survey for ``year`` is looked up first; when it does not exist it is
    created as a published survey that reuses the definition of ``survey_2024``.
    A response (plus an explanatory note) is only created when the NREN has no
    response for that year yet; an existing response is left untouched.

    Note that ``answer`` is modified in place when a response is created: the
    'page' and 'verification_status' defaults are added to it.

    :param survey_2024: the 2024 survey whose definition is used as template
    :param nren: the NREN the answers belong to
    :param year: the survey year the answers belong to
    :param answer: the answers to store on the response
    """
    session = db.session

    survey = session.query(Survey).filter(Survey.year == year).first()
    if not survey:
        # no survey for this year yet, so create one from the 2024 definition
        survey = Survey(year=year, survey=survey_2024.survey, status=SurveyStatus.published)
        session.add(survey)

    existing_response = (
        session.query(SurveyResponse)
        .filter(SurveyResponse.survey_year == year, SurveyResponse.nren_id == nren.id)
        .first()
    )

    if not existing_response:
        # default values for the survey
        answer.update(page=1, verification_status={})

        new_response = SurveyResponse(
            survey=survey,
            survey_year=year,
            nren_id=nren.id,
            nren=nren,
            answers=answer,
            status=ResponseStatus.completed
        )
        import_note = SurveyNotes(
            survey=new_response,
            survey_year=year,
            nren_id=nren.id,
            notes="This survey has been imported by the legacy survey importer. Please review the data for accuracy.",
        )
        session.add(new_response)
        session.add(import_note)

    session.commit()
def delete_surveys():
    """
    Delete all survey notes, survey responses and surveys for the years
    2016 up to and including 2021, then commit.
    """
    first_year, last_year = 2016, 2021

    # The deletion order (notes, responses, surveys) is the same as before;
    # presumably dependent rows have to go before the rows they refer to.
    targets = (
        (SurveyNotes, SurveyNotes.survey_year),
        (SurveyResponse, SurveyResponse.survey_year),
        (Survey, Survey.year),
    )
    for model, year_column in targets:
        db.session.query(model).filter(year_column >= first_year, year_column <= last_year).delete()

    db.session.commit()
def ensure_string_tree(tree: Dict[str, Any]):
    """
    Convert every leaf value of ``tree`` to a string, in place.

    Dicts and lists are recursed into (including lists nested inside lists),
    all other values (int, float, bool, ...) are replaced by ``str(value)``.
    ``None`` carries no answer, so ``None`` values are removed: the key is
    deleted from a dict and the item is dropped from a list. This keeps the
    literal string ``'None'`` out of the converted answers.

    :param tree: the (nested) dict to convert; it is modified in place
    :return: the same ``tree`` object, for convenience
    """
    # iterate over a snapshot, since keys may be deleted while walking
    for key, value in list(tree.items()):
        if value is None:
            del tree[key]
        elif isinstance(value, dict):
            ensure_string_tree(value)
        elif isinstance(value, list):
            _ensure_string_list(value)
        else:
            tree[key] = str(value)
    return tree


def _ensure_string_list(items: list) -> None:
    """List counterpart of ensure_string_tree: stringify ``items`` in place."""
    converted = []
    for item in items:
        if item is None:
            # same rule as for dict values: None is dropped, not stringified
            continue
        if isinstance(item, dict):
            ensure_string_tree(item)
            converted.append(item)
        elif isinstance(item, list):
            _ensure_string_list(item)
            converted.append(item)
        else:
            converted.append(str(item))
    # slice assignment keeps the list object itself, so the parent container
    # does not need to be updated
    items[:] = converted
@click.command()
@click.option('--config', type=click.STRING, default='config.json')
def cli(config):
    """
    Convert legacy response data (2016-2021) into surveys based on the 2024 survey.

    The data is collected from the legacy Excel and database fetchers, the
    existing 2016-2021 surveys are deleted and one completed response per NREN
    and year is inserted again.
    """
    # use a context manager so the config file handle is closed again
    with open(config, 'r') as config_file:
        app_config = load(config_file)

    app_config['SQLALCHEMY_BINDS'] = {survey_model.SURVEY_DB_BIND: app_config['SURVEY_DATABASE_URI']}

    app = compendium_v2._create_app_with_db(app_config)
    print("survey-publisher-legacy starting")

    with app.app_context():
        # All generated surveys reuse the 2024 survey definition. Check that it
        # exists *before* anything is deleted, otherwise the old surveys would
        # be wiped and the import would crash on the first insert.
        survey_2024 = db.session.query(Survey).filter(Survey.year == 2024).first()
        if survey_2024 is None:
            raise click.ClickException(
                'No survey found for 2024; it is required as template for the legacy surveys')

        all_responses = db.session.query(SurveyResponse).filter(SurveyResponse.survey_year == 2024).all()

        data = [resp.answers['data'] for resp in all_responses if 'data' in resp.answers]

        # every question key that was answered by at least one NREN in 2024
        valid_keys = set(chain.from_iterable(answers.keys() for answers in data))

        nren_map = defaultdict(lambda: defaultdict(lambda: {'data': {}}))

        for data_key, nren, nren_id, year, value in chain(fetch_data_excel(), fetch_data_db()):
            if data_key not in valid_keys:
                # unknown keys are only reported, the value is still stored
                print(f'Invalid data key: {data_key} for NREN: {nren} ({nren_id}) in year {year}')

            nren_map[nren][year]['data'][data_key] = value

        # use this to gauge quality of a survey year:
        # answers = [len(d['data']) for yearmap in nren_map.values() for year, d in yearmap.items() if year == 2018]
        # sum(answers) / len(answers)

        delete_surveys()

        for nren, years in nren_map.items():
            for year, answers in years.items():
                if year < 2016 or year > 2021:
                    # data before 2016 is very sparse, so don't move it over
                    # we already have 2022 and above, so don't port those.
                    continue
                answers = ensure_string_tree(answers)
                insert_survey_data(survey_2024, nren, year, answers)


if __name__ == "__main__":
    cli()
This diff is collapsed.
"""
survey_publisher_v1
=========================
This module loads the survey data from before 2022 from legacy Excel files.
Missing info is filled in from the survey db for some questions.
Registered as click cli command when installing compendium-v2.
"""
from __future__ import annotations
import itertools
from sqlalchemy import select
from collections import defaultdict
from compendium_v2.db import db
from compendium_v2.publishers import helpers, excel_parser
from compendium_v2.survey_db import model as survey_model
def budget(nren_dict):
    """
    Yield ``('budget', nren, nren_id, year, budget)`` tuples.

    Budgets are read from the survey database first and from the Excel sheet
    afterwards, so an Excel value replaces a database value for the same NREN
    and year. NRENs whose abbreviation is not a key of ``nren_dict`` are skipped.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    nren_by_id = {nren.id: nren for nren in nren_dict.values()}
    budget_by_nren_year = defaultdict(dict)

    # budgets stored in the survey database
    for db_nren in db.session.scalars(select(survey_model.Nrens)):
        for db_budget in db_nren.budgets:
            abbrev = db_nren.abbreviation.upper()
            if abbrev not in nren_dict:
                continue
            budget_by_nren_year[nren_dict[abbrev].id][db_budget.year] = float(db_budget.budget)

    # budgets from the excel sheet, these overwrite the database values
    for abbrev, amount, year in excel_parser.fetch_budget_excel_data():
        if abbrev not in nren_dict:
            continue
        budget_by_nren_year[nren_dict[abbrev].id][year] = amount

    for nren_id, amount_by_year in budget_by_nren_year.items():
        nren = nren_by_id[nren_id]
        for year, amount in amount_by_year.items():
            yield ('budget', nren, nren.id, year, amount)
def funding(nren_dict):
    """
    Yield ``('income_sources', nren, nren_id, year, sources)`` tuples, where
    ``sources`` is a dict with the funding values per income source.

    Rows for abbreviations that are not in ``nren_dict`` are skipped.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    for row in excel_parser.fetch_funding_excel_data():
        abbrev, year, client_institution, european_funding, gov_public_bodies, commercial, other = row
        if abbrev not in nren_dict:
            continue

        nren = nren_dict[abbrev]
        income_sources = {
            'client_institutions': client_institution,
            'european_funding': european_funding,
            'commercial': commercial,
            'other': other,
            'gov_public_bodies': gov_public_bodies,
        }
        yield ('income_sources', nren, nren.id, year, income_sources)
def charging_structure(nren_dict):
    """
    Yield ``('charging_mechanism', nren, nren_id, year, mechanism)`` tuples.

    ``mechanism`` is the ``name`` of the parsed charging structure; a falsy
    charging structure (e.g. None) is passed on unchanged.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    for abbrev, year, structure in excel_parser.fetch_charging_structure_excel_data():
        if abbrev not in nren_dict:
            continue

        nren = nren_dict[abbrev]
        mechanism = structure.name if structure else structure
        yield ('charging_mechanism', nren, nren.id, year, mechanism)
def staffing(nren_dict):
    """
    Yield ``staff_employment_type`` and ``staff_roles`` tuples per NREN and year.

    The employment type sheet (permanent/subcontracted FTE) and the staff
    function sheet (technical/non-technical FTE) are merged on NREN id and
    year; values missing from one of the sheets are reported as 0.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    def new_record(nren, year, **fte):
        # all FTE values start at 0, the given ones replace the defaults
        record = {
            'nren': nren,
            'nren_id': nren.id,
            'year': year,
            'permanent_fte': 0,
            'subcontracted_fte': 0,
            'technical_fte': 0,
            'non_technical_fte': 0
        }
        record.update(fte)
        return record

    records = {}

    employment_rows = list(excel_parser.fetch_staffing_excel_data())
    for abbrev, year, permanent_fte, subcontracted_fte in employment_rows:
        if abbrev not in nren_dict:
            continue
        nren = nren_dict[abbrev]
        records[(nren.id, year)] = new_record(
            nren, year, permanent_fte=permanent_fte, subcontracted_fte=subcontracted_fte)

    for abbrev, year, technical_fte, non_technical_fte in excel_parser.fetch_staff_function_excel_data():
        if abbrev not in nren_dict:
            continue
        nren = nren_dict[abbrev]
        record_key = (nren.id, year)
        if record_key not in records:
            records[record_key] = new_record(
                nren, year, technical_fte=technical_fte, non_technical_fte=non_technical_fte)
            continue
        records[record_key]['technical_fte'] = technical_fte
        records[record_key]['non_technical_fte'] = non_technical_fte

    for record in records.values():
        nren, year = record['nren'], record['year']
        employment_type = {name: record[name] for name in ('permanent_fte', 'subcontracted_fte')}
        roles = {name: record[name] for name in ('technical_fte', 'non_technical_fte')}

        yield ('staff_employment_type', nren, nren.id, year, employment_type)
        yield ('staff_roles', nren, nren.id, year, roles)
def ecprojects(nren_dict):
    """
    Yield ``('ec_project_names', nren, nren_id, year, projects)`` tuples, where
    ``projects`` is the list of distinct project names of that NREN and year.

    Duplicate project names are removed. The names are collected as dict keys
    instead of in a set, so the list keeps the order in which the projects were
    first seen and the output is the same on every run (the iteration order of
    a set of strings is not).

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    projects_by_nren_year = defaultdict(lambda: defaultdict(dict))

    for abbrev, year, project in excel_parser.fetch_ecproject_excel_data():
        if abbrev not in nren_dict:
            continue
        # the dict is used as an insertion-ordered set, the value is irrelevant
        projects_by_nren_year[abbrev][year][project] = None

    for abbrev, year_projects in projects_by_nren_year.items():
        nren = nren_dict[abbrev]
        for year, projects in year_projects.items():
            yield ('ec_project_names', nren, nren.id, year, list(projects))
def parent_organizations(nren_dict):
    """
    Yield, for every parsed parent organization row of a known NREN, a
    ``parent_organization`` tuple with the value 'Yes' followed by a
    ``parent_organization_name`` tuple with the organization itself.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    for abbrev, year, organization in excel_parser.fetch_organization_excel_data():
        if abbrev in nren_dict:
            nren = nren_dict[abbrev]
            yield ('parent_organization', nren, nren.id, year, 'Yes')
            yield ('parent_organization_name', nren, nren.id, year, organization)
def traffic_volume(nren_dict):
    """
    Yield ``('traffic_estimate', nren, nren_id, year, estimate)`` tuples, where
    ``estimate`` holds the traffic from/to customers and from/to external
    networks.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    for row in excel_parser.fetch_traffic_excel_data():
        abbrev, year, from_external, to_external, from_customers, to_customers = row
        if abbrev not in nren_dict:
            continue

        nren = nren_dict[abbrev]
        if nren.name == 'CESNET' and year == 2019:
            # COMP-447: correct CESNET traffic data for 2019
            to_customers = 222766

        estimate = {
            'from_customers': from_customers,
            'from_external': from_external,
            'to_customers': to_customers,
            'to_external': to_external
        }
        yield ('traffic_estimate', nren, nren.id, year, estimate)
def connected_proportion(nren_dict):
    """
    Yield ``('connectivity_proportions', nren, nren_id, year, proportions)``
    tuples, where ``proportions`` maps the name of each user category to the
    values known for it ('covered', 'market_share_percentage', 'nr_connected'
    and 'nr_of_users').

    The values come from four excel lookups that are keyed by
    (abbreviation, year, user category). Falsy values are left out.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    remit = excel_parser.fetch_remit_excel_data()
    nr_connected = excel_parser.fetch_nr_connected_excel_data()
    market_share = excel_parser.fetch_market_share_excel_data()
    users_served = excel_parser.fetch_users_served_excel_data()

    proportions_by_nren = defaultdict(lambda: defaultdict(dict))

    all_keys = itertools.chain(remit.keys(), nr_connected.keys(), market_share.keys(), users_served.keys())
    for key in all_keys:
        abbrev, year, user_category = key
        if abbrev not in nren_dict:
            continue

        remit_entry = remit.get(key)
        candidates = {
            # the remit is stored by its name
            'covered': remit_entry.name if remit_entry else remit_entry,
            'market_share_percentage': market_share.get(key),
            'nr_connected': nr_connected.get(key),
            'nr_of_users': users_served.get(key),
        }
        known = {name: value for name, value in candidates.items() if value}

        # a key can show up in several lookups, so merge into what is already there
        proportions_by_nren[abbrev][year].setdefault(user_category, {}).update(known)

    for abbrev, by_year in proportions_by_nren.items():
        nren = nren_dict[abbrev]
        for year, by_category in by_year.items():
            connectivity_proportions = {category.name: values for category, values in by_category.items()}
            yield ('connectivity_proportions', nren, nren.id, year, connectivity_proportions)
def connectivity_level(nren_dict):
    """
    Yield ``('connectivity_level', nren, nren_id, year, levels)`` tuples, where
    ``levels`` maps the name of each user category to the values known for it
    ('typical_speed', 'highest_speed' and 'highest_speed_connection_percentage').

    The values come from three excel lookups that are keyed by
    (abbreviation, year, user category). Falsy values are left out.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    typical_speeds = excel_parser.fetch_typical_speed_excel_data()
    highest_speeds = excel_parser.fetch_highest_speed_excel_data()
    highest_speed_proportions = excel_parser.fetch_highest_speed_proportion_excel_data()

    levels_by_nren = defaultdict(lambda: defaultdict(dict))

    all_keys = itertools.chain(typical_speeds.keys(), highest_speeds.keys(), highest_speed_proportions.keys())
    for key in all_keys:
        abbrev, year, user_category = key
        if abbrev not in nren_dict:
            continue

        candidates = {
            'typical_speed': typical_speeds.get(key),
            'highest_speed': highest_speeds.get(key),
            'highest_speed_connection_percentage': highest_speed_proportions.get(key),
        }
        known = {name: value for name, value in candidates.items() if value}

        # a key can show up in several lookups, so merge into what is already there
        levels_by_nren[abbrev][year].setdefault(user_category, {}).update(known)

    for abbrev, by_year in levels_by_nren.items():
        nren = nren_dict[abbrev]
        for year, by_category in by_year.items():
            levels = {category.name: values for category, values in by_category.items()}
            yield ('connectivity_level', nren, nren.id, year, levels)
def connection_carrier(nren_dict):
    """
    Yield ``('traffic_carriers', nren, nren_id, year, carriers)`` tuples, where
    ``carriers`` maps the name of each user category to a dict with the name of
    its carry mechanism (an empty dict if the mechanism is falsy).

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    carriers_by_nren = defaultdict(lambda: defaultdict(dict))

    for key, carry_mechanism in excel_parser.fetch_carriers_excel_data().items():
        abbrev, year, user_category = key
        if abbrev not in nren_dict:
            continue

        known = {'carry_mechanism': carry_mechanism.name} if carry_mechanism else {}
        carriers_by_nren[abbrev][year].setdefault(user_category, {}).update(known)

    for abbrev, by_year in carriers_by_nren.items():
        nren = nren_dict[abbrev]
        for year, by_category in by_year.items():
            traffic_carriers = {category.name: values for category, values in by_category.items()}
            yield ('traffic_carriers', nren, nren.id, year, traffic_carriers)
def connectivity_growth(nren_dict):
    """
    Yield ``('traffic_growth', nren, nren_id, year, growth)`` tuples, where
    ``growth`` maps the name of each user category to a dict with its growth
    rate (an empty dict if the growth rate is falsy).

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    growth_by_nren = defaultdict(lambda: defaultdict(dict))

    for key, growth_percent in excel_parser.fetch_growth_excel_data().items():
        abbrev, year, user_category = key
        if abbrev not in nren_dict:
            continue

        known = {'growth_rate': growth_percent} if growth_percent else {}
        growth_by_nren[abbrev][year].setdefault(user_category, {}).update(known)

    for abbrev, by_year in growth_by_nren.items():
        nren = nren_dict[abbrev]
        for year, by_category in by_year.items():
            traffic_growth = {category.name: values for category, values in by_category.items()}
            yield ('traffic_growth', nren, nren.id, year, traffic_growth)
def connectivity_load(nren_dict):
    """
    Yield ``('traffic_load', nren, nren_id, year, load)`` tuples, where ``load``
    maps the name of each user category to its average and peak traffic, both
    from institutions to the network and from the network to institutions.

    The average and peak lookups are keyed by (abbreviation, year, user
    category) and hold a (from institutions, to institutions) pair. Falsy
    values are left out.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    averages = excel_parser.fetch_average_traffic_excel_data()
    peaks = excel_parser.fetch_peak_traffic_excel_data()

    entry_keys = set()
    for lookup in (averages, peaks):
        entry_keys.update(lookup.keys())

    load_by_nren = defaultdict(lambda: defaultdict(dict))

    for key in entry_keys:
        abbrev, year, user_category = key
        if abbrev not in nren_dict:
            continue

        # a key that is missing from one of the lookups contributes no values
        average = averages.get(key, (None, None))
        peak = peaks.get(key, (None, None))
        candidates = {
            'average_from_institutions_to_network': average[0],
            'average_to_institutions_from_network': average[1],
            'peak_from_institutions_to_network': peak[0],
            'peak_to_institutions_from_network': peak[1],
        }
        known = {name: value for name, value in candidates.items() if value}

        load_by_nren[abbrev][year].setdefault(user_category, {}).update(known)

    for abbrev, by_year in load_by_nren.items():
        nren = nren_dict[abbrev]
        for year, by_category in by_year.items():
            traffic_load = {category.name: values for category, values in by_category.items()}
            yield ('traffic_load', nren, nren.id, year, traffic_load)
def remote_campuses(nren_dict):
    """
    Yield a ``remote_campuses`` tuple with the value "Yes" for every row that
    has connectivity or a country, and a ``remote_campuses_specifics`` tuple
    with a one-item list (country and whether it is connected) for every row
    that has a country.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    for abbrev, year, connectivity, country, connected_to_r_e in excel_parser.fetch_remote_campuses_excel_data():
        if abbrev not in nren_dict:
            continue
        nren = nren_dict[abbrev]

        if connectivity or country:
            yield ('remote_campuses', nren, nren.id, year, "Yes")

        if not country:
            continue
        specifics = [{'country': country, 'connected': connected_to_r_e}]
        yield ('remote_campuses_specifics', nren, nren.id, year, specifics)
def dark_fibre_lease(nren_dict):
    """
    Yield the dark fibre lease (IRU) answers per NREN and year.

    A row that has any lease information yields ``dark_fibre_lease`` with the
    value "Yes", followed by the lease duration and the leased kilometers
    inside and outside the country, each only when it has a truthy value.
    Rows without any lease information are skipped.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    data_rows = excel_parser.fetch_dark_fibre_iru_excel_data()
    iru_durations = excel_parser.fetch_iru_duration_excel_data()

    for (abbrev, year, iru, length_in_country, length_out_country) in data_rows:
        if abbrev not in nren_dict:
            continue

        nren = nren_dict[abbrev]
        lease_duration = iru_durations.get((abbrev, year))

        if not (iru or lease_duration or length_in_country or length_out_country):
            # Nothing to report for this row. Skip only this row: a `return`
            # here would end the generator and drop all remaining rows.
            continue

        yield ('dark_fibre_lease', nren, nren.id, year, "Yes")

        if lease_duration:
            yield ('dark_fibre_lease_duration', nren, nren.id, year, lease_duration)
        if length_in_country:
            yield ('dark_fibre_lease_kilometers_inside_country', nren, nren.id, year, length_in_country)
        if length_out_country:
            yield ('dark_fibre_lease_kilometers_outside_country', nren, nren.id, year, length_out_country)
def dark_fibre_installed(nren_dict):
    """
    Yield ``dark_fibre_nren`` with the value "Yes" for every row that has the
    installed flag or a length, and ``dark_fibre_nren_kilometers_inside_country``
    for every row that has a length.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    for abbrev, year, installed, length in excel_parser.fetch_dark_fibre_installed_excel_data():
        if abbrev not in nren_dict:
            continue
        nren = nren_dict[abbrev]

        if not (installed or length):
            continue
        yield ('dark_fibre_nren', nren, nren.id, year, "Yes")

        if length:
            yield ('dark_fibre_nren_kilometers_inside_country', nren, nren.id, year, length)
def passive_monitoring(nren_dict):
    """
    Yield ``passive_monitoring`` with the value "Yes" for every row that has
    the monitoring flag or a method, and ``passive_monitoring_tech`` with the
    name of the method for every row that has a method.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    for abbrev, year, monitoring, method in excel_parser.fetch_passive_monitoring_excel_data():
        if abbrev not in nren_dict:
            continue
        nren = nren_dict[abbrev]

        if not (monitoring or method):
            continue
        yield ('passive_monitoring', nren, nren.id, year, "Yes")

        if method:
            yield ('passive_monitoring_tech', nren, nren.id, year, method.name)
def capacity(nren_dict):
    """
    Yield ``max_capacity`` and ``typical_capacity`` tuples per NREN and year,
    each only when the capacity has a truthy value.

    Both excel lookups are keyed by (abbreviation, year).

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    largest_link = excel_parser.fetch_largest_link_capacity_excel_data()
    typical_backbone = excel_parser.fetch_typical_backbone_capacity_excel_data()

    capacities = {}
    for key in itertools.chain(largest_link.keys(), typical_backbone.keys()):
        abbrev, _year = key
        if abbrev not in nren_dict:
            continue
        # a key that is in both lookups is visited twice, with the same result
        capacities[key] = (largest_link.get(key), typical_backbone.get(key))

    for (abbrev, year), (max_capacity, typical_capacity) in capacities.items():
        nren = nren_dict[abbrev]
        if max_capacity:
            yield ('max_capacity', nren, nren.id, year, max_capacity)
        if typical_capacity:
            yield ('typical_capacity', nren, nren.id, year, typical_capacity)
def non_r_e_peers(nren_dict):
    """
    Yield ``('non_r_and_e_peers', nren, nren_id, year, number of peers)`` for
    every row of a known NREN that has a truthy number of peers.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    for abbrev, year, peer_count in excel_parser.fetch_non_r_e_peers_excel_data():
        if abbrev not in nren_dict:
            continue
        nren = nren_dict[abbrev]
        if not peer_count:
            continue
        yield ('non_r_and_e_peers', nren, nren.id, year, peer_count)
def ops_automation(nren_dict):
    """
    Yield ``operational_process_automation`` with the value "Yes" for every
    row that has the automation flag or specifics, and
    ``operational_process_automation_tools`` for every row that has specifics.

    :param nren_dict: mapping of NREN abbreviation to NREN object
    """
    for abbrev, year, automation, specifics in excel_parser.fetch_ops_automation_excel_data():
        if abbrev not in nren_dict:
            continue
        nren = nren_dict[abbrev]

        if not (automation or specifics):
            continue
        yield ('operational_process_automation', nren, nren.id, year, "Yes")

        if specifics:
            yield ('operational_process_automation_tools', nren, nren.id, year, specifics)
def fetch_data():
    """
    Yield the ``(data key, nren, nren_id, year, value)`` tuples of all legacy
    excel sources, one source after the other.

    Requires being in a flask app context when called.
    """
    nren_dict = helpers.get_uppercase_nren_dict()

    # the sources are run in this order
    sources = (
        budget,
        funding,
        charging_structure,
        staffing,
        ecprojects,
        parent_organizations,
        traffic_volume,
        connected_proportion,
        connectivity_level,
        connection_carrier,
        connectivity_growth,
        connectivity_load,
        remote_campuses,
        dark_fibre_lease,
        dark_fibre_installed,
        passive_monitoring,
        capacity,
        non_r_e_peers,
        ops_automation,
    )
    for source in sources:
        yield from source(nren_dict)
......@@ -30,10 +30,11 @@ setup(
include_package_data=True,
entry_points={
'console_scripts': [
'excel-survey-publisher=compendium_v2.publishers.survey_publisher_legacy_excel:cli', # noqa
'db-publisher-2022=compendium_v2.publishers.survey_publisher_old_db_2022:cli', # noqa
'conversion=compendium_v2.conversion.conversion:cli', # noqa
'dump_survey_model=compendium_v2.migrations.dump_survey_model:cli', # noqa
'excel-survey-publisher=compendium_v2.publishers.survey_publisher_legacy_excel:cli',
'db-publisher-2022=compendium_v2.publishers.survey_publisher_old_db_2022:cli',
'conversion=compendium_v2.conversion.conversion:cli',
'dump_survey_model=compendium_v2.migrations.dump_survey_model:cli',
'legacy-survey-publisher=compendium_v2.publishers.legacy_publisher.survey_publisher_legacy:cli',
]
},
license='MIT',
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment