Update excel parsing module to use generators instead of CSV

3fd7c8e5 · Bjarke Madsen · bac7e55d · 3fd7c8e5 · bac7e55d
Commit 3fd7c8e5 authored 2 years ago by Bjarke Madsen
--- a/compendium_v2/background_task/parse_excel_data.py
+++ b/compendium_v2/background_task/parse_excel_data.py
+import openpyxl
+import csv
+import os
+from compendium_v2 import db
+from compendium_v2.db import model
+from compendium_v2.environment import setup_logging
+import logging
+setup_logging()
+logger = logging.getLogger('xlsx_to_csv_sheet_parsing_task')
+EXCEL_FILE = os.path.join(
+    os.path.dirname(__file__), "xlsx",
+    "2021_Organisation_DataSeries.xlsx")
+def import_countries():
+    """Load budget rows from ``csv/BudgetCsvFile.csv`` into the database.
+
+    Each CSV row is read positionally as ``nren, budget, year`` and turned
+    into a ``model.BudgetEntry`` that is added to the session opened by
+    ``db.session_scope()``. The CSV path is relative to the current
+    working directory.
+    """
+    with db.session_scope() as session:
+        # newline='' is the csv module's documented way to open files.
+        with open('csv/BudgetCsvFile.csv', newline='') as csvfile:
+            # A plain reader yields lists, which is what the positional
+            # row[0]/row[1]/row[2] access below needs; DictReader rows
+            # are keyed by header name and would raise KeyError here.
+            reader = csv.reader(csvfile)
+            for row in reader:
+                if not row:
+                    # csv.reader yields an empty list for a blank line.
+                    continue
+                data = model.BudgetEntry(
+                    nren=row[0], budget=row[1], year=row[2])
+                # NOTE(review): save() is kept from the original code;
+                # confirm BudgetEntry actually provides it.
+                data.save()
+                # Add every entry while the session scope is still open
+                # (previously only the last row was added, after exit).
+                session.add(data)
+def fetch_budget_excel_data():
+    """Yield ``(nren, budget, year)`` tuples from the "1. Budget" sheet.
+
+    Rows 14-56 of the worksheet are scanned: column 2 holds the NREN
+    value, columns 3-7 hold the budget figures and row 13 of those same
+    columns holds the matching year. Empty budget cells are skipped.
+    Each budget is divided by 1,000,000 and rounded to two decimals
+    before being yielded; results above 200 are logged.
+    """
+    sheet_name = "1. Budget"
+    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    try:
+        ws = wb[sheet_name]
+        budget_columns = range(3, 8)
+        # The year header does not depend on the data row: read it once.
+        years = {col: ws.cell(row=13, column=col).value
+                 for col in budget_columns}
+        for row in range(14, 57):
+            # The NREN cell does not depend on the budget column.
+            nren = ws.cell(row=row, column=2).value
+            for col in budget_columns:
+                budget = ws.cell(row=row, column=col).value
+                if budget is None:
+                    continue
+                budget = round(budget / 1000000, 2)
+                year = years[col]
+                if budget > 200:
+                    logger.info(
+                        f'{nren} has budget set to '
+                        f'>200M EUR for {year}. ({budget})')
+                yield nren, budget, year
+    finally:
+        # A read-only workbook keeps the file open until it is closed
+        # explicitly; this also runs if the generator is closed early.
+        wb.close()
+def fetch_funding_excel_data():
+    """Yield income source rows from the "2. Income Sources" sheet.
+
+    Each item is ``(nren, year, client_institution, european_funding,
+    gov_public_bodies, commercial, other)``. The five funding values are
+    floats; empty or non-numeric cells are logged and replaced by
+    ``0.0``. Rows 8-49 are read for each year block, and the years are
+    emitted in the order 2016, 2017, 2018, 2019, 2020. Rows whose NREN
+    cell is empty are skipped.
+    """
+    sheet_name = "2. Income Sources"
+    # (label used in log messages, column offset from the NREN column)
+    sources = (
+        ("client institution", 3),
+        ("european funding", 4),
+        ("gov/public_bodies", 5),
+        ("commercial", 6),
+        ("other", 7),
+    )
+    # (year, column holding the NREN value for that year's block)
+    year_blocks = ((2016, 43), (2017, 32), (2018, 21), (2019, 12), (2020, 3))
+
+    def hard_number_convert(s, source_name, nren, year):
+        """Return ``s`` as a float, or ``0.0`` (logged) if that fails."""
+        if s is None:
+            logger.info(
+                f'Invalid Value :{nren} has empty value for {source_name}.'
+                + f'for year ({year})')
+            return float(0)
+        try:
+            return float(s)
+        except (TypeError, ValueError):
+            # ValueError: non-numeric text. TypeError: a cell value that
+            # float() cannot take at all (e.g. a date); without it one
+            # such cell would abort the whole generator.
+            logger.info(
+                f'Invalid Value :{nren} has empty value for {source_name}.'
+                + f'for year ({year}) with value ({s})')
+            return float(0)
+
+    wb = openpyxl.load_workbook(EXCEL_FILE, data_only=True, read_only=True)
+    try:
+        ws = wb[sheet_name]
+
+        def create_points_for_year(start_row, end_row, year, col_start):
+            """Yield one tuple per row in [start_row, end_row) for year."""
+            for row in range(start_row, end_row):
+                nren = ws.cell(row=row, column=col_start).value
+                if nren is None:
+                    # Skip before converting, so rows without an NREN
+                    # do not produce "Invalid Value" log noise.
+                    continue
+                values = [
+                    hard_number_convert(
+                        ws.cell(row=row, column=col_start + offset).value,
+                        source_name, nren, year)
+                    for source_name, offset in sources
+                ]
+                yield (nren, year, *values)
+
+        for year, col_start in year_blocks:
+            yield from create_points_for_year(8, 50, year, col_start)
+    finally:
+        # A read-only workbook keeps the file open until it is closed
+        # explicitly; this also runs if the generator is closed early.
+        wb.close()
--- a/compendium_v2/background_task/xlsx_to_csv_sheet_parsing_task.py
+++ b/compendium_v2/background_task/xlsx_to_csv_sheet_parsing_task.py
-import openpyxl
-import csv
-import os
-from compendium_v2 import db
-from compendium_v2.db import model
-from compendium_v2.environment import setup_logging
-import logging
-setup_logging()
-logger = logging.getLogger('xlsx_to_csv_sheet_parsing_task')
-# Import the data to database
-def import_countries():
-    with db.session_scope() as session:
-        with open('csv/BudgetCsvFile.csv') as csvfile:
-            reader = csv.DictReader(csvfile)
-            for row in reader:
-                data = model.BudgetEntry(
-                    nren=row[0], budget=row[1], year=row[2])
-                data.save()
-    session.add(data)
-def parse_budget_xlsx_file():
-    try:
-        # load the xlsx file
-        filename = "compendium_v2/background_task/xlsx" \
-                   "/2021_Organisation_DataSeries.xlsx "
-        csv_out_file = "compendium_v2/background_task/csv/BudgetCsvFile.csv"
-        sheet_name = "1. Budget"
-        wb = openpyxl.load_workbook(
-            filename, data_only=True, read_only=True)
-        # select the active worksheet
-        ws = wb[sheet_name]
-        if os.path.exists(csv_out_file) and os.path.isfile(csv_out_file):
-            os.remove(csv_out_file)
-            print("file deleted " + csv_out_file)
-        # iterate over the rows in the worksheet
-        for row in range(14, 57):
-            for col in range(3, 8):
-                # extract the data from the row
-                nren = ws.cell(row=row, column=2).value
-                budget = ws.cell(row=row, column=col).value
-                year = ws.cell(row=13, column=col).value
-                if budget is not None:
-                    budget = round(budget / 1000000, 2)
-                    if budget > 200:
-                        logger.info(
-                            f'{nren} has budget set to '
-                            f'>200M EUR for {year}. ({budget})')
-                    # process the data (e.g. save to database)
-                    # print(f"NREN: {nren}, Budget: {budget}, Year: {year}")
-                    output_csv_file = csv.writer(
-                        open(csv_out_file, 'a'),
-                        delimiter=",")
-                    output_csv_file.writerow([nren, budget, year])
-        output_csv_file
-    except Exception as e:
-        print(e)
-        # import_countries()
-def parse_income_source_xlsx_file():
-    try:
-        # load the xlsx file
-        filename = "compendium_v2/background_task/xlsx" \
-                   "/2021_Organisation_DataSeries.xlsx "
-        csv_out_file = "compendium_v2/background_task/csv" \
-                       "/FundingSourceCsvFile.csv "
-        sheet_name = "2. Income Sources"
-        wb = openpyxl.load_workbook(
-            filename, data_only=True, read_only=True)
-        # select the active worksheet
-        ws = wb[sheet_name]
-        if os.path.exists(csv_out_file) and os.path.isfile(csv_out_file):
-            os.remove(csv_out_file)
-            print("file deleted " + csv_out_file)
-        def hard_number_convert(s, source_name, nren, year):
-            if s is None:
-                logger.info(
-                    f'Invalid Value :{nren} has empty value for {source_name}.'
-                    + f'for year ({year})')
-                return float(0)
-            """ Returns True if string is a number. """
-            try:
-                return float(s)
-            except ValueError:
-                logger.info(
-                    f'Invalid Value :{nren} has empty value for {source_name}.'
-                    + f'for year ({year}) with value ({s})')
-                return float(0)
-        # iterate over the rows in the worksheet
-        def create_csv_per_year(start_row, end_row, yearI, col_start):
-            for row in range(start_row, end_row):
-                # extract the data from the row
-                nren = ws.cell(row=row, column=col_start).value
-                client_institution = ws.cell(row=row,
-                                             column=col_start + 3).value
-                european_funding = ws.cell(row=row, column=col_start + 4).value
-                gov_public_bodies = ws.cell(row=row,
-                                            column=col_start + 5).value
-                commercial = ws.cell(row=row, column=col_start + 6).value
-                other = ws.cell(row=row, column=col_start + 7).value
-                year = yearI
-                client_institution = hard_number_convert(
-                    client_institution, "client institution", nren, year)
-                european_funding = hard_number_convert(
-                    european_funding, "european funding", nren, year)
-                gov_public_bodies = hard_number_convert(
-                    gov_public_bodies, "gov/public_bodies", nren, year)
-                commercial = hard_number_convert(
-                    commercial, "commercial", nren, year)
-                other = hard_number_convert(
-                    other, "other", nren, year)
-                # process the data (e.g. save to database)
-                if nren is not None:
-                    output_csv_file = csv.writer(
-                        open(csv_out_file, 'a'),
-                        delimiter=",")
-                    output_csv_file.writerow([nren, year, client_institution,
-                                              european_funding,
-                                              gov_public_bodies,
-                                              commercial, other])
-        # For 2020
-        create_csv_per_year(8, 50, 2020, 3)
-        # # For 2019
-        create_csv_per_year(8, 50, 2019, 12)
-        # # For 2018
-        create_csv_per_year(8, 50, 2018, 21)
-        # # For 2017
-        create_csv_per_year(8, 50, 2017, 32)
-        # # For 2016
-        create_csv_per_year(8, 50, 2016, 43)
-    except Exception as e:
-        print(e)