diff --git a/compendium_v2/background_task/parse_excel_data.py b/compendium_v2/background_task/parse_excel_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..879a20edd24b496e9d4bebcb4a5149c647e0d64e
--- /dev/null
+++ b/compendium_v2/background_task/parse_excel_data.py
@@ -0,0 +1,162 @@
+import openpyxl
+import csv
+import os
+from compendium_v2 import db
+from compendium_v2.db import model
+from compendium_v2.environment import setup_logging
+import logging
+
+setup_logging()
+
+logger = logging.getLogger('xlsx_to_csv_sheet_parsing_task')
+
+EXCEL_FILE = os.path.join(
+    os.path.dirname(__file__), "xlsx",
+    "2021_Organisation_DataSeries.xlsx")
+
+# Anchored on this module, like EXCEL_FILE, so that the import does not
+# depend on the current working directory.
+BUDGET_CSV_FILE = os.path.join(
+    os.path.dirname(__file__), "csv", "BudgetCsvFile.csv")
+
+# (year, first column of that year's block) on the "2. Income Sources"
+# sheet, listed in the order in which the data points are yielded.
+_FUNDING_YEAR_COLUMNS = (
+    (2016, 43),
+    (2017, 32),
+    (2018, 21),
+    (2019, 12),
+    (2020, 3),
+)
+
+# (source name used in log messages, column offset from the NREN column),
+# listed in the order of the yielded values.
+_FUNDING_SOURCE_OFFSETS = (
+    ("client institution", 3),
+    ("european funding", 4),
+    ("gov/public_bodies", 5),
+    ("commercial", 6),
+    ("other", 7),
+)
+
+
+def import_countries():
+    """Load the rows of the budget CSV file into the database.
+
+    Every record is read positionally as ``nren, budget, year`` and stored
+    as a ``model.BudgetEntry`` inside a single database session.
+    """
+    with db.session_scope() as session:
+        with open(BUDGET_CSV_FILE, newline='') as csvfile:
+            # csv.reader yields lists, which is what the positional
+            # indexing below needs; csv.DictReader rows are dicts keyed
+            # by the header names, so row[0] raised KeyError.
+            # NOTE(review): assumes the file has no header line - confirm.
+            for row in csv.reader(csvfile):
+                if not row:
+                    continue  # blank line
+                data = model.BudgetEntry(
+                    nren=row[0], budget=row[1], year=row[2])
+                # NOTE(review): save() is not defined in this module;
+                # confirm that BudgetEntry provides it.
+                data.save()
+                session.add(data)
+
+
+def fetch_budget_excel_data():
+    """Yield ``(nren, budget, year)`` for each cell of the budget table.
+
+    The table is read from the "1. Budget" sheet: NREN names come from
+    column 2 of rows 14-56 and the years from row 13 of columns 3-7.
+    Budgets are divided by 1,000,000 and rounded to two decimals; empty
+    cells are yielded with ``budget`` set to ``None``.
+    """
+    sheet_name = "1. Budget"
+    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    try:
+        ws = wb[sheet_name]
+        year_columns = range(3, 8)
+        # the year headers are the same for every row, so read them once
+        years = {
+            col: ws.cell(row=13, column=col).value for col in year_columns}
+
+        for row in range(14, 57):
+            nren = ws.cell(row=row, column=2).value
+            for col in year_columns:
+                year = years[col]
+                budget = ws.cell(row=row, column=col).value
+
+                if budget is not None:
+                    budget = round(budget / 1000000, 2)
+                    if budget > 200:
+                        logger.info(
+                            f'{nren} has budget set to '
+                            f'>200M EUR for {year}. ({budget})')
+
+                yield nren, budget, year
+    finally:
+        # a read-only workbook keeps its file open until closed explicitly
+        wb.close()
+
+
+def _hard_number_convert(s, source_name, nren, year):
+    """Return ``s`` as a float; log and return 0.0 if that is impossible.
+
+    ``source_name``, ``nren`` and ``year`` are only used in the log message.
+    """
+    if s is None:
+        logger.info(
+            f'Invalid Value :{nren} has empty value for {source_name} '
+            f'for year ({year})')
+        return float(0)
+    try:
+        return float(s)
+    except (TypeError, ValueError):
+        # TypeError: the value is neither a string nor a number
+        logger.info(
+            f'Invalid Value :{nren} has non-numeric value for '
+            f'{source_name} for year ({year}) with value ({s})')
+        return float(0)
+
+
+def _funding_points_for_year(ws, start_row, end_row, year, col_start):
+    """Yield the funding data points of one year block of the sheet.
+
+    Rows ``start_row`` (inclusive) to ``end_row`` (exclusive) are read; the
+    NREN name is in column ``col_start``. Rows without an NREN name are
+    skipped.
+    """
+    for row in range(start_row, end_row):
+        nren = ws.cell(row=row, column=col_start).value
+        if nren is None:
+            # skip before converting, so that no "Invalid Value"
+            # messages are logged for rows that are dropped anyway
+            continue
+
+        amounts = [
+            _hard_number_convert(
+                ws.cell(row=row, column=col_start + offset).value,
+                source_name, nren, year)
+            for source_name, offset in _FUNDING_SOURCE_OFFSETS]
+
+        yield (nren, year, *amounts)
+
+
+def fetch_funding_excel_data():
+    """Yield one funding data point per NREN and year.
+
+    Each item is the tuple ``(nren, year, client_institution,
+    european_funding, gov_public_bodies, commercial, other)`` read from the
+    "2. Income Sources" sheet. The five source values are floats, 0.0 when
+    the cell is empty or not numeric. Years are yielded from 2016 to 2020.
+    """
+    sheet_name = "2. Income Sources"
+    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    try:
+        ws = wb[sheet_name]
+        for year, col_start in _FUNDING_YEAR_COLUMNS:
+            yield from _funding_points_for_year(
+                ws, 8, 50, year, col_start)
+    finally:
+        # a read-only workbook keeps its file open until closed explicitly
+        wb.close()
diff --git a/compendium_v2/background_task/xlsx_to_csv_sheet_parsing_task.py b/compendium_v2/background_task/xlsx_to_csv_sheet_parsing_task.py
deleted file mode 100644
index ee13a4b9e6917f6d37b895f9a3b859104fd65ed7..0000000000000000000000000000000000000000
--- a/compendium_v2/background_task/xlsx_to_csv_sheet_parsing_task.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import openpyxl
-import csv
-import os
-from compendium_v2 import db
-from compendium_v2.db import model
-from compendium_v2.environment import setup_logging
-import logging
-
-setup_logging()
-
-logger = logging.getLogger('xlsx_to_csv_sheet_parsing_task')
-
-
-# Import the data to database
-def import_countries():
-    with db.session_scope() as session:
-        with open('csv/BudgetCsvFile.csv') as csvfile:
-            reader = csv.DictReader(csvfile)
-            for row in reader:
-                data = model.BudgetEntry(
-                    nren=row[0], budget=row[1], year=row[2])
-                data.save()
-                session.add(data)
-
-
-def parse_budget_xlsx_file():
-    try:
-        # load the xlsx file
-        filename = "compendium_v2/background_task/xlsx" \
-                   "/2021_Organisation_DataSeries.xlsx "
-        csv_out_file = "compendium_v2/background_task/csv/BudgetCsvFile.csv"
-        sheet_name = "1. Budget"
-        wb = openpyxl.load_workbook(
-            filename, data_only=True, read_only=True)
-
-        # select the active worksheet
-        ws = wb[sheet_name]
-        if os.path.exists(csv_out_file) and os.path.isfile(csv_out_file):
-            os.remove(csv_out_file)
-            print("file deleted " + csv_out_file)
-        # iterate over the rows in the worksheet
-        for row in range(14, 57):
-            for col in range(3, 8):
-                # extract the data from the row
-                nren = ws.cell(row=row, column=2).value
-                budget = ws.cell(row=row, column=col).value
-                year = ws.cell(row=13, column=col).value
-
-                if budget is not None:
-                    budget = round(budget / 1000000, 2)
-                    if budget > 200:
-                        logger.info(
-                            f'{nren} has budget set to '
-                            f'>200M EUR for {year}. ({budget})')
-
-                # process the data (e.g. save to database)
-                # print(f"NREN: {nren}, Budget: {budget}, Year: {year}")
-                output_csv_file = csv.writer(
-                    open(csv_out_file, 'a'),
-                    delimiter=",")
-                output_csv_file.writerow([nren, budget, year])
-                output_csv_file
-    except Exception as e:
-        print(e)
-
-    # import_countries()
-
-
-def parse_income_source_xlsx_file():
-    try:
-        # load the xlsx file
-        filename = "compendium_v2/background_task/xlsx" \
-                   "/2021_Organisation_DataSeries.xlsx "
-        csv_out_file = "compendium_v2/background_task/csv" \
-                       "/FundingSourceCsvFile.csv "
-        sheet_name = "2. Income Sources"
-        wb = openpyxl.load_workbook(
-            filename, data_only=True, read_only=True)
-
-        # select the active worksheet
-        ws = wb[sheet_name]
-        if os.path.exists(csv_out_file) and os.path.isfile(csv_out_file):
-            os.remove(csv_out_file)
-            print("file deleted " + csv_out_file)
-
-        def hard_number_convert(s, source_name, nren, year):
-            if s is None:
-                logger.info(
-                    f'Invalid Value :{nren} has empty value for {source_name}.'
-                    + f'for year ({year})')
-                return float(0)
-            """ Returns True if string is a number. """
-            try:
-                return float(s)
-            except ValueError:
-                logger.info(
-                    f'Invalid Value :{nren} has empty value for {source_name}.'
-                    + f'for year ({year}) with value ({s})')
-                return float(0)
-
-        # iterate over the rows in the worksheet
-
-        def create_csv_per_year(start_row, end_row, yearI, col_start):
-            for row in range(start_row, end_row):
-                # extract the data from the row
-                nren = ws.cell(row=row, column=col_start).value
-                client_institution = ws.cell(row=row,
-                                             column=col_start + 3).value
-                european_funding = ws.cell(row=row, column=col_start + 4).value
-                gov_public_bodies = ws.cell(row=row,
-                                            column=col_start + 5).value
-                commercial = ws.cell(row=row, column=col_start + 6).value
-                other = ws.cell(row=row, column=col_start + 7).value
-                year = yearI
-
-                client_institution = hard_number_convert(
-                    client_institution, "client institution", nren, year)
-                european_funding = hard_number_convert(
-                    european_funding, "european funding", nren, year)
-                gov_public_bodies = hard_number_convert(
-                    gov_public_bodies, "gov/public_bodies", nren, year)
-                commercial = hard_number_convert(
-                    commercial, "commercial", nren, year)
-                other = hard_number_convert(
-                    other, "other", nren, year)
-
-                # process the data (e.g. save to database)
-                if nren is not None:
-                    output_csv_file = csv.writer(
-                        open(csv_out_file, 'a'),
-                        delimiter=",")
-                    output_csv_file.writerow([nren, year, client_institution,
-                                              european_funding,
-                                              gov_public_bodies,
-                                              commercial, other])
-
-        # For 2020
-        create_csv_per_year(8, 50, 2020, 3)
-        # # For 2019
-        create_csv_per_year(8, 50, 2019, 12)
-        # # For 2018
-        create_csv_per_year(8, 50, 2018, 21)
-        # # For 2017
-        create_csv_per_year(8, 50, 2017, 32)
-        # # For 2016
-        create_csv_per_year(8, 50, 2016, 43)
-
-    except Exception as e:
-        print(e)