From 11d4e71cdc6b0da5106d5066df8c1df1fdbcc57f Mon Sep 17 00:00:00 2001
From: Marco Malavolti <marco.malavolti@gmail.com>
Date: Thu, 23 Jul 2020 00:24:11 +0200
Subject: [PATCH] Added IdP exclusion with robots.txt

---
 README.md          |  7 +++++-
 eccs2.py           | 53 ++++++++++++++++++++++++++++++----------------
 eccs2properties.py |  3 +++
 3 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index b614fc9..f962745 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,12 @@ There are some situations where the check cannot work reliably. In those cases i
 
 # Disable Checks
 
-In cases where an IdP cannot be reliably checked, it is necessary to create, also empty, `eccs-disabled.txt` file on IdP's web root.
+In cases where an IdP cannot be reliably checked, it is necessary to create or enrich the `robots.txt` file on the IdP's web root with:
+
+```bash
+User-agent: ECCS
+Disallow: /
+```
 
 # On-line interface
 
diff --git a/eccs2.py b/eccs2.py
index 00d8245..f72f50c 100755
--- a/eccs2.py
+++ b/eccs2.py
@@ -6,7 +6,7 @@
 import json
 import re
 import requests
-from eccs2properties import DAY, ECCS2HTMLDIR, ECCS2OUTPUTDIR, ECCS2RESULTSLOG, FEDS_BLACKLIST, IDPS_BLACKLIST, ECCS2SPS, ECCS2SELENIUMDEBUG
+from eccs2properties import DAY, ECCS2HTMLDIR, ECCS2OUTPUTDIR, ECCS2RESULTSLOG, FEDS_BLACKLIST, IDPS_BLACKLIST, ECCS2SPS, ECCS2SELENIUMDEBUG,ROBOTS_USER_AGENT
 from pathlib import Path
 from selenium.common.exceptions import TimeoutException
 from urllib3.util import parse_url
@@ -26,6 +26,15 @@ def getIDPfqdn(entityIDidp):
    else:
       return entityIDidp.split(":")[-1]
 
+# Return True if the ECCS check MUST not be run
+def checkRobots(url_robots_txt):
+   robots_txt = requests.get(url_robots_txt)
+   p = re.compile('^User-agent:\sECCS\sDisallow:\s\/\s*$', re.MULTILINE)
+   m = p.search(robots_txt.text)
+   if (m):
+      return True
+   else:
+      return False
 
 # The function check that the IdP recognized the SP by presenting its Login page. 
 # If the IdP Login page contains "username" and "password" fields, than the test is passed.
@@ -51,36 +60,44 @@ def checkIdP(sp,idp,test):
    fqdn_sp = parse_url(sp)[2]
    wayfless_url = sp + idp['entityID']
 
-   exclude_idp = ""
+   robots = ""
    try:
-      headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
-      exclude_idp = requests.get("https://%s/eccs-disabled.txt" % fqdn_idp, headers=headers, verify=False, timeout=30)
-
-      if (exclude_idp == ""):
-         exclude_idp = requests.get("http://%s/eccs-disabled.txt" % fqdn_idp, headers=headers, verify=False, timeout=30)
+      headers = {
+         'User-Agent': '%s' % ROBOTS_USER_AGENT
+      }
 
-   except requests.exceptions.ConnectionError as e:
-      print("!!! ECCS-DISABLED REQUESTS CONNECTION ERROR EXCEPTION !!!")
-      #print (e.__str__())
-      exclude_idp = ""
+      robots = requests.get("https://%s/robots.txt" % fqdn_idp, headers=headers, verify=True, timeout=30)
 
-   except requests.exceptions.Timeout as e:
-      print("!!! ECCS-DISABLED REQUESTS TIMEOUT EXCEPTION !!!")
-      #print (e.__str__())
-      exclude_idp = ""
+      if (robots == ""):
+         robots = requests.get("http://%s/robots.txt" % fqdn_idp, headers=headers, verify=True, timeout=30)
 
-   if (exclude_idp):
+   except (requests.exceptions.ConnectionError,requests.exceptions.Timeout,requests.exceptions.SSLError) as e:
       check_time = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
 
       if (test is not True):
         with open("%s/%s/%s---%s.html" % (ECCS2HTMLDIR,DAY,fqdn_idp,fqdn_sp),"w") as html:
-            html.write("IdP excluded from check by eccs-disabled.txt")
+            html.write("IdP excluded from check because the download of 'robots.txt' failed: %s" % e.__str__())
      else:
-         print("IdP excluded from check by eccs-disabled.txt")
+         print("IdP excluded from check because the download of 'robots.txt' failed: %s" % e.__str__())
 
       return (idp['entityID'],wayfless_url,check_time,"NULL","DISABLED")
 
+   if (robots):
+      check_time = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
+
+      p = re.compile('^User-agent:\sECCS\sDisallow:\s\/\s*$', re.MULTILINE)
+      m = p.search(robots.text)
+
+      if (m):
+         if (test is not True):
+            with open("%s/%s/%s---%s.html" % (ECCS2HTMLDIR,DAY,fqdn_idp,fqdn_sp),"w") as html:
+               html.write("IdP excluded from check by robots.txt")
+         else:
+            print("IdP excluded from check by robots.txt")
+
+         return (idp['entityID'],wayfless_url,check_time,"NULL","DISABLED")
+
    if (idp['registrationAuthority'] in federation_blacklist):
       check_time = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
 
diff --git a/eccs2properties.py b/eccs2properties.py
index 33312c2..c8e76a3 100644
--- a/eccs2properties.py
+++ b/eccs2properties.py
@@ -39,6 +39,9 @@ ECCS2NUMPROCESSES = 25
 # The 2 SPs that will be used to test each IdP
 ECCS2SPS = ["https://sp24-test.garr.it/Shibboleth.sso/Login?entityID=", "https://attribute-viewer.aai.switch.ch/Shibboleth.sso/Login?entityID="]
 
+# ROBOTS.TXT
+ROBOTS_USER_AGENT = "ECCS/2.0 (+https://dev-mm.aai-test.garr.it/eccs2)"
+
 # Registration Authority of Federations to exclude from the check
 FEDS_BLACKLIST = [
    'http://www.surfconext.nl/',
-- 
GitLab