Select Git revision
parse_excel_data.py
Remco Tukker authored
parse_excel_data.py 15.55 KiB
import logging
import openpyxl
import os
from compendium_v2.db.model import FeeType
from compendium_v2.environment import setup_logging
setup_logging()
logger = logging.getLogger(__name__)
EXCEL_FILE = os.path.join(os.path.dirname(__file__), "xlsx", "2021_Organisation_DataSeries.xlsx")
NETWORK_EXCEL_FILE = os.path.join(os.path.dirname(__file__), "xlsx", "2022_Networks_DataSeries.xlsx")
def fetch_budget_excel_data():
# load the xlsx file
sheet_name = "1. Budget"
wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
# select the active worksheet
ws = wb[sheet_name]
# iterate over the rows in the worksheet
for row in range(14, 58):
for col in range(3, 9):
# extract the data from the row
nren = ws.cell(row=row, column=2).value
budget = ws.cell(row=row, column=col).value
year = ws.cell(row=13, column=col).value
if budget is not None:
budget = round(budget / 1000000, 2)
yield nren.upper(), budget, year
def fetch_funding_excel_data():
# load the xlsx file
wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
# select the active worksheet
sheet_name = "2. Income Sources"
ws = wb[sheet_name]
ws2 = wb["Income Sources table"]
def hard_number_convert(s, source_name, nren, year):
if s is None:
logger.info(f'Invalid Value :{nren} has empty value for {source_name} for year ({year})')
return float(0)
try:
return float(s)
except ValueError:
logger.info(f'Invalid Value :{nren} has empty value for {source_name} for year ({year}) with value ({s})')
return float(0)
# iterate over the rows in the worksheet
def create_points_for_year_until_2017(start_row, end_row, year, col_nren, col_start):
for row in range(start_row, end_row):
# extract the data from the row
nren = ws.cell(row=row, column=col_nren).value
client_institution = ws.cell(row=row, column=col_start).value
commercial = ws.cell(row=row, column=col_start + 1).value
geant_subsidy = ws.cell(row=row, column=col_start + 2).value
gov_public_bodies = ws.cell(row=row, column=col_start + 3).value
other_european_funding = ws.cell(row=row, column=col_start + 4).value
other = ws.cell(row=row, column=col_start + 5).value
client_institution = hard_number_convert(client_institution, "client institution", nren, year)
commercial = hard_number_convert(commercial, "commercial", nren, year)
geant_subsidy = hard_number_convert(geant_subsidy, "geant subsidy", nren, year)
gov_public_bodies = hard_number_convert(gov_public_bodies, "gov/public_bodies", nren, year)
other_european_funding = hard_number_convert(other_european_funding, "other european funding", nren, year)
other = hard_number_convert(other, "other", nren, year)
european_funding = geant_subsidy + other_european_funding
# process the data (e.g. save to database)
if nren is not None:
yield (nren.upper(), year, client_institution, european_funding, gov_public_bodies, commercial, other)
def create_points_for_year_from_2018(sheet, start_row, end_row, year, nren_col, col_start):
for row in range(start_row, end_row):
# extract the data from the row
nren = sheet.cell(row=row, column=nren_col).value
client_institution = sheet.cell(row=row, column=col_start).value
european_funding = sheet.cell(row=row, column=col_start + 1).value
gov_public_bodies = sheet.cell(row=row, column=col_start + 2).value
commercial = sheet.cell(row=row, column=col_start + 3).value
other = sheet.cell(row=row, column=col_start + 4).value
client_institution = hard_number_convert(client_institution, "client institution", nren, year)
european_funding = hard_number_convert(european_funding, "european funding", nren, year)
gov_public_bodies = hard_number_convert(gov_public_bodies, "gov/public_bodies", nren, year)
commercial = hard_number_convert(commercial, "commercial", nren, year)
other = hard_number_convert(other, "other", nren, year)
# process the data (e.g. save to database)
if nren is not None:
yield (nren.upper(), year, client_institution, european_funding, gov_public_bodies, commercial, other)
# For 2016
yield from create_points_for_year_until_2017(8, 51, 2016, 43, 45)
# For 2017
yield from create_points_for_year_until_2017(8, 51, 2017, 32, 35)
# For 2018
yield from create_points_for_year_from_2018(ws, 8, 51, 2018, 21, 24)
# For 2019
yield from create_points_for_year_from_2018(ws, 8, 51, 2019, 12, 15)
# For 2020
yield from create_points_for_year_from_2018(ws, 8, 51, 2020, 3, 6)
# For 2021
yield from create_points_for_year_from_2018(ws2, 8, 51, 2021, 11, 12)
def fetch_charging_structure_excel_data():
# load the xlsx file
wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
# select the active worksheet
sheet_name = "3. Charging mechanism"
ws = wb[sheet_name]
# iterate over the rows in the worksheet
def create_points_for_2021(start_row, end_row, year, col_start):
for row in range(start_row, end_row):
# extract the data from the row
nren = ws.cell(row=row, column=col_start).value
charging_structure = ws.cell(row=row, column=col_start + 1).value
logger.info(f'NREN: {nren}, Charging Structure: {charging_structure}, Year: {year}')
if charging_structure is not None:
if "do not charge" in charging_structure:
charging_structure = FeeType.no_charge
elif "combination" in charging_structure:
charging_structure = FeeType.combination
elif "flat" in charging_structure:
charging_structure = FeeType.flat_fee
elif "usage-based" in charging_structure:
charging_structure = FeeType.usage_based_fee
elif "Other" in charging_structure:
charging_structure = FeeType.other
else:
charging_structure = None
logger.info(f'NREN: {nren}, Charging Structure: {charging_structure}, Year: {year}')
yield nren.upper(), year, charging_structure
def create_points_for_2019(start_row, end_row, year, col_start):
for row in range(start_row, end_row):
# extract the data from the row
nren = ws.cell(row=row, column=col_start).value
charging_structure = ws.cell(row=row, column=col_start + 1).value
structure_2 = ws.cell(row=row, column=col_start + 2).value
logger.info(f'NREN: {nren}, Charging Structure: {charging_structure}, {structure_2}, Year: {year}')
if charging_structure is not None:
if structure_2 not in [None, '']:
charging_structure = FeeType.other
elif "do not charge" in charging_structure:
charging_structure = FeeType.no_charge
elif "combination" in charging_structure:
charging_structure = FeeType.combination
elif "flat" in charging_structure:
charging_structure = FeeType.flat_fee
elif "usage-based" in charging_structure:
charging_structure = FeeType.usage_based_fee
elif "Other" in charging_structure:
charging_structure = FeeType.other
else:
charging_structure = None
logger.info(f'NREN: {nren}, Charging Structure: {charging_structure}, Year: {year}')
yield nren.upper(), year, charging_structure
# For 2021
yield from create_points_for_2021(3, 46, 2021, 2)
# For 2019
yield from create_points_for_2019(3, 46, 2019, 6)
def fetch_staffing_excel_data():
# load the xlsx file
wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
# select the active worksheet
sheet_name = "4. Staff"
ws = wb[sheet_name]
start_row = 18
end_row = 61
def convert_number(value, nren, year, description):
try:
return float(value)
except (TypeError, ValueError):
logger.info(f'NREN: {nren} year: {year} has {value} for {description}; set to 0.')
return 0
def create_points_for_year(year, nren_column, start_column):
for row in range(start_row, end_row):
# extract the data from the row
nren = ws.cell(row=row, column=nren_column).value
permanent = ws.cell(row=row, column=start_column).value
permanent = convert_number(permanent, nren, year, "permanent ftes")
subcontracted = ws.cell(row=row, column=start_column + 1).value
subcontracted = convert_number(subcontracted, nren, year, "subcontractor ftes")
if permanent + subcontracted > 0:
yield nren.upper(), year, permanent, subcontracted
# For 2016
yield from create_points_for_year(2016, 53, 55)
# For 2017
yield from create_points_for_year(2017, 43, 46)
# For 2018
yield from create_points_for_year(2018, 33, 36)
# For 2019
yield from create_points_for_year(2019, 23, 26)
# For 2020
yield from create_points_for_year(2020, 13, 16)
# For 2021
yield from create_points_for_year(2021, 2, 5)
def fetch_staff_function_excel_data():
# load the xlsx file
wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
# select the active worksheet
sheet_name = "5. Staff by Function"
ws = wb[sheet_name]
start_row = 14
end_row = 58
def convert_number(value, nren, year, description):
try:
return float(value)
except (TypeError, ValueError):
logger.info(f'NREN: {nren} year: {year} has {value} for {description}; set to 0.')
return 0
def read_cell_number(row, column, nren, year, description):
value = ws.cell(row=row, column=column).value
return convert_number(value, nren, year, description)
def create_points_for_year_until_2019(year, nren_column, start_column):
for row in range(start_row, end_row):
# extract the data from the row
nren = ws.cell(row=row, column=nren_column).value
if nren is None:
continue
admin = read_cell_number(row, start_column, nren, year, "admin and finance ftes")
communication = read_cell_number(row, start_column + 1, nren, year, "communication ftes")
infosec = read_cell_number(row, start_column + 2, nren, year, "infosec ftes")
it = read_cell_number(row, start_column + 3, nren, year, "it and software dev ftes")
noc = read_cell_number(row, start_column + 4, nren, year, "NOC and engineering ftes")
others = read_cell_number(row, start_column + 5, nren, year, "other ftes")
technical = infosec + it + noc
non_technical = admin + communication + others
if technical + non_technical > 0:
yield nren.upper(), year, technical, non_technical
def create_points_for_year(year, nren_column, start_column):
for row in range(start_row, end_row):
# extract the data from the row
nren = ws.cell(row=row, column=nren_column).value
if nren is None:
continue
technical = read_cell_number(row, start_column, nren, year, "technical ftes")
non_technical = read_cell_number(row, start_column + 1, nren, year, "non-technical ftes")
if technical + non_technical > 0:
yield nren.upper(), year, technical, non_technical
# For 2017
yield from create_points_for_year_until_2019(2017, 41, 43)
# For 2018
yield from create_points_for_year_until_2019(2018, 31, 33)
# For 2019
yield from create_points_for_year_until_2019(2019, 20, 22)
# For 2020
yield from create_points_for_year(2020, 12, 14)
# For 2021
yield from create_points_for_year(2021, 3, 5)
def fetch_ecproject_excel_data():
# load the xlsx file
wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
# select the active worksheet
sheet_name = "7. EC Projects"
ws = wb[sheet_name]
start_row = 6
def create_points_for_year(year, start_column, end_row):
for row in range(start_row, end_row):
# extract the data from the row
nren = ws.cell(row=row, column=start_column).value
if nren is None:
continue
project = ws.cell(row=row, column=start_column + 1).value
if project is None:
continue
yield nren.upper(), year, project
yield from create_points_for_year(2017, 13, 165)
yield from create_points_for_year(2018, 10, 165)
yield from create_points_for_year(2019, 7, 165)
yield from create_points_for_year(2020, 4, 180)
yield from create_points_for_year(2021, 1, 173)
def fetch_organization_excel_data():
# load the xlsx file
wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
# select the active worksheet
sheet_name = "Organization"
ws = wb[sheet_name]
# iterate over the rows in the worksheet
for row in range(5, 48):
# extract the data from the row
nren = ws.cell(row=row, column=2).value
parent_org = ws.cell(row=row, column=4).value
if parent_org not in [None, 'NA', 'N/A']:
yield nren.upper(), 2021, parent_org
def fetch_traffic_excel_data():
# load the xlsx file
wb = openpyxl.load_workbook(NETWORK_EXCEL_FILE, data_only=True, read_only=True)
# select the active worksheet
sheet_name = "Estimated_Traffic TByte"
ws = wb[sheet_name]
rows = list(ws.rows)
def convert_number(value, nren, year, description):
if value is None or value == '--' or value == 'No data':
return 0
try:
return float(value)
except (TypeError, ValueError):
logger.info(f'NREN: {nren} year: {year} has {value} for {description}; set to 0.')
return 0
def create_points_for_year(year, start_column):
for i in range(6, 49):
nren_name = rows[i][start_column].value
if not nren_name:
continue
nren_name = nren_name.upper()
from_external = convert_number(rows[i][start_column + 1].value, nren_name, year, 'from_external')
to_external = convert_number(rows[i][start_column + 2].value, nren_name, year, 'to_external')
from_customer = convert_number(rows[i][start_column + 3].value, nren_name, year, 'from_customer')
to_customer = convert_number(rows[i][start_column + 4].value, nren_name, year, 'to_customer')
if from_external == 0 and to_external == 0 and from_customer == 0 and to_customer == 0:
continue
yield nren_name, year, from_external, to_external, from_customer, to_customer
yield from create_points_for_year(2016, 38)
yield from create_points_for_year(2017, 32)
yield from create_points_for_year(2018, 26)
yield from create_points_for_year(2019, 20)
yield from create_points_for_year(2020, 14)
yield from create_points_for_year(2021, 8)
yield from create_points_for_year(2022, 2)