parse_excel_data.py

import openpyxl
import os
import logging

from compendium_v2.db.model import FeeType
from compendium_v2.environment import setup_logging

setup_logging()

logger = logging.getLogger(__name__)

EXCEL_FILE = os.path.join(
    os.path.dirname(__file__), "xlsx",
    "2021_Organisation_DataSeries.xlsx")


def fetch_budget_excel_data():
    # load the xlsx file
    sheet_name = "1. Budget"
    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)

    # select the active worksheet
    ws = wb[sheet_name]
    # iterate over the rows in the worksheet
    for row in range(14, 57):
        for col in range(3, 8):
            # extract the data from the row
            nren = ws.cell(row=row, column=2).value
            budget = ws.cell(row=row, column=col).value
            year = ws.cell(row=13, column=col).value

            if budget is not None:
                budget = round(budget / 1000000, 2)
                if budget > 200:
                    logger.info(
                        f'{nren} has budget set to '
                        f'>200M EUR for {year}. ({budget})')

                # process the data (e.g. save to database)
                # print(f"NREN: {nren}, Budget: {budget}, Year: {year}")

                yield nren.upper(), budget, year


def fetch_funding_excel_data():
    # load the xlsx file
    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)

    # select the active worksheet
    sheet_name = "2. Income Sources"
    ws = wb[sheet_name]

    def hard_number_convert(s, source_name, nren, year):
        if s is None:
            logger.info(
                f'Invalid Value :{nren} has empty value for {source_name}.'
                + f'for year ({year})')
            return float(0)
        """ Returns True if string is a number. """
        try:
            return float(s)
        except ValueError:
            logger.info(
                f'Invalid Value :{nren} has empty value for {source_name}.'
                + f'for year ({year}) with value ({s})')
            return float(0)

    # iterate over the rows in the worksheet
    def create_points_for_year_until_2017(start_row, end_row, year, col_nren, col_start):
        for row in range(start_row, end_row):
            # extract the data from the row
            nren = ws.cell(row=row, column=col_nren).value
            client_institution = ws.cell(row=row, column=col_start).value
            commercial = ws.cell(row=row, column=col_start + 1).value
            geant_subsidy = ws.cell(row=row, column=col_start + 2).value
            gov_public_bodies = ws.cell(row=row, column=col_start + 3).value
            other_european_funding = ws.cell(row=row, column=col_start + 4).value
            other = ws.cell(row=row, column=col_start + 5).value
            print(nren, client_institution, commercial, geant_subsidy, gov_public_bodies, other_european_funding, other)

            client_institution = hard_number_convert(client_institution, "client institution", nren, year)
            commercial = hard_number_convert(commercial, "commercial", nren, year)
            geant_subsidy = hard_number_convert(geant_subsidy, "geant subsidy", nren, year)
            gov_public_bodies = hard_number_convert(gov_public_bodies, "gov/public_bodies", nren, year)
            other_european_funding = hard_number_convert(other_european_funding, "other european funding", nren, year)
            other = hard_number_convert(other, "other", nren, year)

            european_funding = geant_subsidy + other_european_funding

            # process the data (e.g. save to database)
            if nren is not None:
                yield (nren.upper(), year, client_institution,
                       european_funding,
                       gov_public_bodies,
                       commercial, other)

    def create_points_for_year_from_2018(start_row, end_row, year, col_start):
        for row in range(start_row, end_row):
            # extract the data from the row
            nren = ws.cell(row=row, column=col_start).value
            client_institution = ws.cell(row=row, column=col_start + 3).value
            european_funding = ws.cell(row=row, column=col_start + 4).value
            gov_public_bodies = ws.cell(row=row, column=col_start + 5).value
            commercial = ws.cell(row=row, column=col_start + 6).value
            other = ws.cell(row=row, column=col_start + 7).value

            client_institution = hard_number_convert(client_institution, "client institution", nren, year)
            european_funding = hard_number_convert(european_funding, "european funding", nren, year)
            gov_public_bodies = hard_number_convert(gov_public_bodies, "gov/public_bodies", nren, year)
            commercial = hard_number_convert(commercial, "commercial", nren, year)
            other = hard_number_convert(other, "other", nren, year)

            # process the data (e.g. save to database)
            if nren is not None:
                yield (nren.upper(), year, client_institution,
                       european_funding,
                       gov_public_bodies,
                       commercial, other)

    # For 2016
    yield from create_points_for_year_until_2017(8, 50, 2016, 43, 45)

    # For 2017
    yield from create_points_for_year_until_2017(8, 50, 2017, 32, 35)

    # For 2018
    yield from create_points_for_year_from_2018(8, 50, 2018, 21)

    # For 2019
    yield from create_points_for_year_from_2018(8, 50, 2019, 12)

    # For 2020
    yield from create_points_for_year_from_2018(8, 50, 2020, 3)


def fetch_charging_structure_excel_data():
    # load the xlsx file
    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)

    # select the active worksheet
    sheet_name = "3. Charging mechanism"
    ws = wb[sheet_name]

    # iterate over the rows in the worksheet
    def create_points_for_year(start_row, end_row, year, col_start):
        for row in range(start_row, end_row):
            # extract the data from the row
            nren = ws.cell(row=row, column=col_start).value
            charging_structure = ws.cell(row=row, column=col_start + 1).value
            logger.info(
                f'NREN: {nren}, Charging Structure: {charging_structure},'
                f' Year: {year}')
            if charging_structure is not None:
                if "do not charge" in charging_structure:
                    charging_structure = FeeType.no_charge.value
                elif "combination" in charging_structure:
                    charging_structure = FeeType.combination.value
                elif "flat" in charging_structure:
                    charging_structure = FeeType.flat_fee.value
                elif "usage-based" in charging_structure:
                    charging_structure = FeeType.usage_based_fee.value
                elif "Other" in charging_structure:
                    charging_structure = FeeType.other.value
                else:
                    charging_structure = None

                logger.info(
                    f'NREN: {nren}, Charging Structure: {charging_structure},'
                    f' Year: {year}')

                yield nren.upper(), year, charging_structure

    # For 2021
    yield from create_points_for_year(3, 45, 2021, 2)

    # For 2019
    yield from create_points_for_year(3, 45, 2019, 6)


def fetch_staffing_excel_data():
    # load the xlsx file
    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)

    # select the active worksheet
    sheet_name = "4. Staff"
    ws = wb[sheet_name]

    start_row = 18
    end_row = 61

    def convert_number(value, nren, year, description):
        try:
            return float(value)
        except (TypeError, ValueError):
            logger.info(f'NREN: {nren} year: {year} has {value} for {description}; set to 0.')
            return 0

    def create_points_for_year(year, nren_column, start_column):
        for row in range(start_row, end_row):
            # extract the data from the row
            nren = ws.cell(row=row, column=nren_column).value
            permanent = ws.cell(row=row, column=start_column).value
            permanent = convert_number(permanent, nren, year, "permanent ftes")
            subcontracted = ws.cell(row=row, column=start_column + 1).value
            subcontracted = convert_number(subcontracted, nren, year, "subcontractor ftes")
            if permanent + subcontracted > 0:
                yield nren.upper(), year, permanent, subcontracted

    # For 2016
    yield from create_points_for_year(2016, 53, 55)

    # For 2017
    yield from create_points_for_year(2017, 43, 46)

    # For 2018
    yield from create_points_for_year(2018, 33, 36)

    # For 2019
    yield from create_points_for_year(2019, 23, 26)

    # For 2020
    yield from create_points_for_year(2020, 13, 16)

    # For 2021
    yield from create_points_for_year(2021, 2, 5)


def fetch_staff_function_excel_data():
    # load the xlsx file
    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)

    # select the active worksheet
    sheet_name = "5. Staff by Function"
    ws = wb[sheet_name]

    start_row = 14
    end_row = 58

    def convert_number(value, nren, year, description):
        try:
            return float(value)
        except (TypeError, ValueError):
            logger.info(f'NREN: {nren} year: {year} has {value} for {description}; set to 0.')
            return 0

    def read_cell_number(row, column, nren, year, description):
        value = ws.cell(row=row, column=column).value
        return convert_number(value, nren, year, description)

    def create_points_for_year_until_2019(year, nren_column, start_column):
        for row in range(start_row, end_row):
            # extract the data from the row
            nren = ws.cell(row=row, column=nren_column).value
            if nren is None:
                continue
            admin = read_cell_number(row, start_column, nren, year, "admin and finance ftes")
            communication = read_cell_number(row, start_column + 1, nren, year, "communication ftes")
            infosec = read_cell_number(row, start_column + 2, nren, year, "infosec ftes")
            it = read_cell_number(row, start_column + 3, nren, year, "it and software dev ftes")
            noc = read_cell_number(row, start_column + 4, nren, year, "NOC and engineering ftes")
            others = read_cell_number(row, start_column + 5, nren, year, "other ftes")

            technical = infosec + it + noc
            non_technical = admin + communication + others

            if technical + non_technical > 0:
                yield nren.upper(), year, technical, non_technical

    def create_points_for_year(year, nren_column, start_column):
        for row in range(start_row, end_row):
            # extract the data from the row
            nren = ws.cell(row=row, column=nren_column).value
            if nren is None:
                continue
            technical = read_cell_number(row, start_column, nren, year, "technical ftes")
            non_technical = read_cell_number(row, start_column + 1, nren, year, "non-technical ftes")
            if technical + non_technical > 0:
                yield nren.upper(), year, technical, non_technical

    # For 2017
    yield from create_points_for_year_until_2019(2017, 41, 43)

    # For 2018
    yield from create_points_for_year_until_2019(2018, 31, 33)

    # For 2019
    yield from create_points_for_year_until_2019(2019, 20, 22)

    # For 2020
    yield from create_points_for_year(2020, 12, 14)

    # For 2021
    yield from create_points_for_year(2021, 3, 5)