Skip to content
Snippets Groups Projects
Commit 3fd7c8e5 authored by Bjarke Madsen
Browse files

Update excel parsing module to use generators instead of CSV

parent bac7e55d
No related branches found
No related tags found
No related merge requests found
import openpyxl
import csv
import os
from compendium_v2 import db
from compendium_v2.db import model
from compendium_v2.environment import setup_logging
import logging
# Configure logging at import time, before the module logger is created.
# NOTE(review): what setup_logging() actually configures is defined in
# compendium_v2.environment and is not visible from this file.
setup_logging()
logger = logging.getLogger('xlsx_to_csv_sheet_parsing_task')
# Path of the source workbook: the "xlsx" directory next to this module.
EXCEL_FILE = os.path.join(
    os.path.dirname(__file__), "xlsx",
    "2021_Organisation_DataSeries.xlsx")
def import_countries():
    """Load budget rows from ``csv/BudgetCsvFile.csv`` into the database.

    Every CSV row is read positionally as ``nren, budget, year`` and turned
    into a ``model.BudgetEntry`` that is added to the session provided by
    ``db.session_scope()``.

    The file is parsed with ``csv.reader`` because the columns are accessed
    by position. ``csv.DictReader`` yields dicts keyed by the first line of
    the file, so ``row[0]`` would raise ``KeyError`` (and the first data row
    would be consumed as a header).
    """
    with db.session_scope() as session:
        # newline='' is the mode the csv module expects for files it parses.
        # NOTE(review): the path is relative to the current working
        # directory; confirm this is where the CSV actually lives.
        with open('csv/BudgetCsvFile.csv', newline='') as csvfile:
            for row in csv.reader(csvfile):
                if not row:
                    # Blank line in the file: nothing to import.
                    continue
                nren, budget, year = row[0], row[1], row[2]
                data = model.BudgetEntry(
                    nren=nren, budget=budget, year=year)
                # NOTE(review): save() is kept from the original code;
                # confirm that BudgetEntry really provides it.
                data.save()
                session.add(data)
def fetch_budget_excel_data():
    """Yield ``(nren, budget, year)`` tuples from the "1. Budget" sheet.

    Reads rows 14-56 and columns 3-7 of the workbook at ``EXCEL_FILE``.
    The NREN name comes from column 2 of each row and the year from row 13
    of each budget column. Every non-empty budget cell is divided by
    1,000,000 and rounded to two decimals before being yielded; empty cells
    are skipped. Converted values above 200 are logged (the log message
    treats them as millions of EUR).

    The workbook is opened in read-only mode, which keeps the file open
    until ``close()`` is called, so it is closed in a ``finally`` block.
    That also runs when the consumer stops iterating early and the
    generator is closed or garbage collected.
    """
    sheet_name = "1. Budget"
    budget_columns = range(3, 8)
    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
    try:
        ws = wb[sheet_name]

        # The header row does not change while iterating: read it once
        # instead of once per data cell.
        years = {
            col: ws.cell(row=13, column=col).value
            for col in budget_columns
        }

        for row in range(14, 57):
            # The NREN name is the same for every budget column of a row.
            nren = ws.cell(row=row, column=2).value
            for col in budget_columns:
                budget = ws.cell(row=row, column=col).value
                if budget is None:
                    continue

                year = years[col]
                budget = round(budget / 1000000, 2)
                if budget > 200:
                    logger.info(
                        f'{nren} has budget set to '
                        f'>200M EUR for {year}. ({budget})')

                yield nren, budget, year
    finally:
        wb.close()
def fetch_funding_excel_data():
    """Yield funding-source rows from the "2. Income Sources" sheet.

    Each yielded item is the tuple ``(nren, year, client_institution,
    european_funding, gov_public_bodies, commercial, other)`` where the last
    five entries are floats. The years 2016 to 2020 are emitted in that
    order; each year is read from rows 8-49 of its own column group in the
    workbook at ``EXCEL_FILE``. Rows without an NREN name are skipped.

    The workbook is opened in read-only mode, which keeps the file open
    until ``close()`` is called, so it is closed in a ``finally`` block once
    the generator finishes or is closed early.
    """
    sheet_name = "2. Income Sources"

    # (column offset from the NREN column, name used in log messages),
    # in the order the values appear in the yielded tuple.
    sources = (
        (3, "client institution"),
        (4, "european funding"),
        (5, "gov/public_bodies"),
        (6, "commercial"),
        (7, "other"),
    )

    # (year, column holding the NREN name for that year's block).
    year_columns = ((2016, 43), (2017, 32), (2018, 21), (2019, 12), (2020, 3))

    def hard_number_convert(s, source_name, nren, year):
        """Return ``s`` as a float; log and return 0.0 if it is not numeric."""
        if s is None:
            logger.info(
                f'Invalid Value :{nren} has empty value for {source_name}.'
                + f'for year ({year})')
            return float(0)
        try:
            return float(s)
        except (TypeError, ValueError):
            # TypeError covers cell values that are neither str nor number.
            logger.info(
                f'Invalid Value :{nren} has empty value for {source_name}.'
                + f'for year ({year}) with value ({s})')
            return float(0)

    def create_points_for_year(ws, start_row, end_row, year, col_start):
        """Yield one tuple per named NREN row of a single year block."""
        for row in range(start_row, end_row):
            nren = ws.cell(row=row, column=col_start).value
            if nren is None:
                # Blank row: skip before converting so that no
                # "None has empty value" noise is logged for it.
                continue
            values = [
                hard_number_convert(
                    ws.cell(row=row, column=col_start + offset).value,
                    source_name, nren, year)
                for offset, source_name in sources
            ]
            yield (nren, year, *values)

    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
    try:
        ws = wb[sheet_name]
        for year, col_start in year_columns:
            yield from create_points_for_year(ws, 8, 50, year, col_start)
    finally:
        wb.close()
import openpyxl
import csv
import os
from compendium_v2 import db
from compendium_v2.db import model
from compendium_v2.environment import setup_logging
import logging
# Configure logging at import time, before the module logger is created.
# NOTE(review): what setup_logging() actually configures is defined in
# compendium_v2.environment and is not visible from this file.
setup_logging()
logger = logging.getLogger('xlsx_to_csv_sheet_parsing_task')
# Import the data to database
def import_countries():
    """Load budget rows from ``csv/BudgetCsvFile.csv`` into the database.

    Every CSV row is read positionally as ``nren, budget, year`` and turned
    into a ``model.BudgetEntry`` that is added to the session provided by
    ``db.session_scope()``.

    The file is parsed with ``csv.reader`` because the columns are accessed
    by position. ``csv.DictReader`` yields dicts keyed by the first line of
    the file, so ``row[0]`` would raise ``KeyError`` (and the first data row
    would be consumed as a header).
    """
    with db.session_scope() as session:
        # newline='' is the mode the csv module expects for files it parses.
        # NOTE(review): the path is relative to the current working
        # directory; confirm this is where the CSV actually lives.
        with open('csv/BudgetCsvFile.csv', newline='') as csvfile:
            for row in csv.reader(csvfile):
                if not row:
                    # Blank line in the file: nothing to import.
                    continue
                nren, budget, year = row[0], row[1], row[2]
                data = model.BudgetEntry(
                    nren=nren, budget=budget, year=year)
                # NOTE(review): save() is kept from the original code;
                # confirm that BudgetEntry really provides it.
                data.save()
                session.add(data)
def parse_budget_xlsx_file():
    """Export the "1. Budget" sheet of the workbook to ``BudgetCsvFile.csv``.

    Reads rows 14-56 and columns 3-7 of the sheet. The NREN name comes from
    column 2 of each row and the year from row 13 of each budget column.
    Every non-empty budget cell is divided by 1,000,000, rounded to two
    decimals and written as one ``nren,budget,year`` CSV line; converted
    values above 200 are also logged. An existing output file is deleted
    first.

    Errors are logged with their traceback and not re-raised, so the
    function stays best-effort. Both paths are relative to the current
    working directory.
    """
    # No trailing space in the name: with one, openpyxl does not recognise
    # the ".xlsx " extension and the export can never run.
    filename = ("compendium_v2/background_task/xlsx"
                "/2021_Organisation_DataSeries.xlsx")
    csv_out_file = "compendium_v2/background_task/csv/BudgetCsvFile.csv"
    sheet_name = "1. Budget"
    budget_columns = range(3, 8)

    try:
        wb = openpyxl.load_workbook(
            filename, data_only=True, read_only=True)
        try:
            ws = wb[sheet_name]

            if os.path.exists(csv_out_file) and os.path.isfile(csv_out_file):
                os.remove(csv_out_file)
                print("file deleted " + csv_out_file)

            # The header row does not change while iterating: read it once.
            years = {
                col: ws.cell(row=13, column=col).value
                for col in budget_columns
            }

            # Open the output once and let the context manager close it,
            # instead of leaking one file handle per written row. Note that
            # the file is now created even if no row ends up being written.
            with open(csv_out_file, 'a', newline='') as csvfile:
                writer = csv.writer(csvfile, delimiter=",")
                for row in range(14, 57):
                    nren = ws.cell(row=row, column=2).value
                    for col in budget_columns:
                        budget = ws.cell(row=row, column=col).value
                        if budget is None:
                            continue

                        year = years[col]
                        budget = round(budget / 1000000, 2)
                        if budget > 200:
                            logger.info(
                                f'{nren} has budget set to '
                                f'>200M EUR for {year}. ({budget})')

                        writer.writerow([nren, budget, year])
        finally:
            # Read-only workbooks keep the file open until closed.
            wb.close()
    except Exception:
        # Best-effort task: record the failure with its traceback rather
        # than printing only the exception message.
        logger.exception('Failed to export budget data from %s', filename)
# import_countries()
def parse_income_source_xlsx_file():
    """Export the "2. Income Sources" sheet to ``FundingSourceCsvFile.csv``.

    For each of the years 2020 down to 2016 (in that order) rows 8-49 of the
    year's column group are read. Every row with an NREN name is written as
    one CSV line ``nren,year,client_institution,european_funding,
    gov_public_bodies,commercial,other`` where the five funding values are
    floats; empty or non-numeric cells are logged and written as ``0.0``.
    An existing output file is deleted first.

    Errors are logged with their traceback and not re-raised, so the
    function stays best-effort. Both paths are relative to the current
    working directory.
    """
    # No trailing spaces in the paths: with one, openpyxl does not recognise
    # the ".xlsx " extension, and the CSV would get a stray-space file name.
    filename = ("compendium_v2/background_task/xlsx"
                "/2021_Organisation_DataSeries.xlsx")
    csv_out_file = ("compendium_v2/background_task/csv"
                    "/FundingSourceCsvFile.csv")
    sheet_name = "2. Income Sources"

    # (column offset from the NREN column, name used in log messages),
    # in the order the values appear in the CSV line.
    sources = (
        (3, "client institution"),
        (4, "european funding"),
        (5, "gov/public_bodies"),
        (6, "commercial"),
        (7, "other"),
    )

    # (year, column holding the NREN name for that year's block).
    year_columns = ((2020, 3), (2019, 12), (2018, 21), (2017, 32), (2016, 43))

    def hard_number_convert(s, source_name, nren, year):
        """Return ``s`` as a float; log and return 0.0 if it is not numeric."""
        if s is None:
            logger.info(
                f'Invalid Value :{nren} has empty value for {source_name}.'
                + f'for year ({year})')
            return float(0)
        try:
            return float(s)
        except (TypeError, ValueError):
            # TypeError covers cell values that are neither str nor number.
            logger.info(
                f'Invalid Value :{nren} has empty value for {source_name}.'
                + f'for year ({year}) with value ({s})')
            return float(0)

    def create_csv_per_year(ws, writer, start_row, end_row, year, col_start):
        """Write one CSV line per named NREN row of a single year block."""
        for row in range(start_row, end_row):
            nren = ws.cell(row=row, column=col_start).value
            if nren is None:
                # Blank row: skip before converting so that no
                # "None has empty value" noise is logged for it.
                continue
            values = [
                hard_number_convert(
                    ws.cell(row=row, column=col_start + offset).value,
                    source_name, nren, year)
                for offset, source_name in sources
            ]
            writer.writerow([nren, year, *values])

    try:
        wb = openpyxl.load_workbook(
            filename, data_only=True, read_only=True)
        try:
            ws = wb[sheet_name]

            if os.path.exists(csv_out_file) and os.path.isfile(csv_out_file):
                os.remove(csv_out_file)
                print("file deleted " + csv_out_file)

            # Open the output once and let the context manager close it,
            # instead of leaking one file handle per written row. Note that
            # the file is now created even if no row ends up being written.
            with open(csv_out_file, 'a', newline='') as csvfile:
                writer = csv.writer(csvfile, delimiter=",")
                for year, col_start in year_columns:
                    create_csv_per_year(ws, writer, 8, 50, year, col_start)
        finally:
            # Read-only workbooks keep the file open until closed.
            wb.close()
    except Exception:
        # Best-effort task: record the failure with its traceback rather
        # than printing only the exception message.
        logger.exception(
            'Failed to export income source data from %s', filename)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment