diff --git a/compendium_v2/publishers/helpers.py b/compendium_v2/publishers/helpers.py index 358492da0a0cdd6504ff3c9c3f3daafd9cb5ee81..f6492d2d93e97db681a6a93b768ef7a70c87f0d1 100644 --- a/compendium_v2/publishers/helpers.py +++ b/compendium_v2/publishers/helpers.py @@ -1,11 +1,15 @@ import re +from typing import List from sqlalchemy import select from compendium_v2.db import db, model URL_PATTERN = re.compile( - r'\b(https?://[^\s<>";,(){}\[\]!\\]+|www\.[^\s<>";,(){}\[\]!\\]+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4})\b(?=\s|\b|[,!?.;:\\])' + (r'\b(https?://[^\s<>";,(){}\[\]!\\]+' + r'|www\.[^\s<>";,(){}\[\]!\\]+' + r'|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4})' + r'\b(?=\s|\b|[,!?.;:\\])') ) @@ -30,5 +34,5 @@ def get_uppercase_nren_dict(): return nren_dict -def extract_urls(text: str) -> list[str]: +def extract_urls(text: str) -> List[str]: return re.findall(URL_PATTERN, text) diff --git a/compendium_v2/publishers/survey_publisher_2022.py b/compendium_v2/publishers/survey_publisher_2022.py index 14b16afa5ed6ac748d2c4d4aa7d3ea007764d801..f0591e531ce30f76367c43f4c3b551f95ddbae3c 100644 --- a/compendium_v2/publishers/survey_publisher_2022.py +++ b/compendium_v2/publishers/survey_publisher_2022.py @@ -68,16 +68,13 @@ INSTITUTIONS_URLS_QUERY_UNTIL_2022 = """ JOIN sections s ON q.section_id = s.id JOIN compendia c ON s.compendium_id = c.id WHERE q.id = 16507 - UNION ALL - -- Recursive case SELECT q.id, q.equivalent_question_id, c.year, q.title FROM questions q INNER JOIN parent_questions pq ON q.id = pq.equivalent_question_id JOIN sections s ON q.section_id = s.id JOIN compendia c ON s.compendium_id = c.id) - SELECT DISTINCT ON (n.id, answers.question_id) answers.id, UPPER(n.abbreviation) AS nren, parent_questions.year, diff --git a/test/test_helpers.py b/test/test_helpers.py index 6870f98df979d5d35855ef25dfcdadf4051a6146..520637484b3d19d29327ae6ec85eee5fd9a53752 100644 --- a/test/test_helpers.py +++ b/test/test_helpers.py @@ -33,7 +33,10 @@ from compendium_v2.publishers.helpers import extract_urls ("URL with encoded characters: https://example.com/path%20to%20page", ["https://example.com/path%20to%20page"]), ("URL with encoded query: https://example.com/path?query=value%20with%20space", ["https://example.com/path?query=value%20with%20space"]), - ("[URL with encoded hash: https://example.com/path#section%20two;]", ["https://example.com/path#section%20two"]), + ( + "[URL with encoded hash: https://example.com/path#section%20two;]", + ["https://example.com/path#section%20two"] + ), ] ) def test_extract_urls_from_a_text(text, expected): diff --git a/test/test_survey_publisher_2022.py b/test/test_survey_publisher_2022.py index 4b406c6ac0283d0cfd8f707038ec2799918dcbe8..63fb83c3b44ff3e1b96b6b1f81c43057c42fbe24 100644 --- a/test/test_survey_publisher_2022.py +++ b/test/test_survey_publisher_2022.py @@ -194,10 +194,17 @@ def test_publisher(app_with_survey_db, mocker, dummy_config): ('nren3', 'n.a. online'), ] + def institutions_urls_data(): + return [ + (87483, 'ANA', 2013, "http://www.rash.al/index.php/network/points-of-presence-pop"), + (163286, 'ANA', 2014, "http://www.rash.al/index.php/network/points-of-presence-pop"), + ] + mocker.patch('compendium_v2.publishers.survey_publisher_2022.query_budget', get_rows_as_tuples) mocker.patch('compendium_v2.publishers.survey_publisher_2022.query_funding_sources', funding_source_data) mocker.patch('compendium_v2.publishers.survey_publisher_2022.query_question', question_data) mocker.patch('compendium_v2.publishers.survey_publisher_2022.query_question_id', question_id_data) + mocker.patch('compendium_v2.publishers.survey_publisher_2022.query_institutions_urls', institutions_urls_data) nren_names = ['Nren1', 'Nren2', 'Nren3', 'Nren4', 'SURF', 'KIFU', 'University of Malta', 'ASNET-AM', 'SIKT', 'LAT', 'RASH', 'ANAS', 'GRNET', 'CSC'] @@ -302,3 +309,14 @@ def test_publisher(app_with_survey_db, mocker, dummy_config): assert len(policy_data_2022) == 2 assert policy_data_2020[0].strategic_plan == 'www.nren.com/somepolicy2020.pdf' assert policy_data_2020[1].strategic_plan == 'policyemail@nren.com' + + _institution_urls_data = db.session.scalars( + select(model.InstitutionURLs).order_by(model.InstitutionURLs.nren_id.asc()) + ).all() + assert len(_institution_urls_data) == 2 + assert _institution_urls_data[0].nren.name.lower() == 'rash' + assert _institution_urls_data[0].year == 2013 + assert _institution_urls_data[0].urls == ["http://www.rash.al/index.php/network/points-of-presence-pop"] + assert _institution_urls_data[1].nren.name.lower() == 'rash' + assert _institution_urls_data[1].year == 2014 + assert _institution_urls_data[1].urls == ["http://www.rash.al/index.php/network/points-of-presence-pop"]