From 070f4a7f5e16fc0a500d5addae3b4312bb0d7828 Mon Sep 17 00:00:00 2001
From: Karel van Klink <karel.vanklink@geant.org>
Date: Tue, 11 Jul 2023 11:38:22 +0200
Subject: [PATCH] Rework the proxy steps, such that they are retried up to
 three times if needed

Also, remove a bunch of flake errors that we are not causing
---
 gso/services/provisioning_proxy.py    | 11 ++++---
 gso/translations/en-GB.json           |  2 ++
 gso/workflows/device/create_device.py | 42 ++++++++++++++++++++++-----
 tox.ini                               |  7 ++---
 4 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/gso/services/provisioning_proxy.py b/gso/services/provisioning_proxy.py
index 862a750e..11068bac 100644
--- a/gso/services/provisioning_proxy.py
+++ b/gso/services/provisioning_proxy.py
@@ -4,6 +4,7 @@ LSO is responsible for executing Ansible playbooks, that deploy subscriptions.
 """
 import json
 import logging
+from typing import NoReturn
 
 import requests
 from orchestrator import inputstep
@@ -20,6 +21,7 @@ from gso.products.product_types.device import DeviceProvisioning
 from gso.products.product_types.iptrunk import Iptrunk, IptrunkProvisioning
 
 logger = logging.getLogger(__name__)
+DEFAULT_LABEL = "Provisioning proxy is running. Please come back later for the results."
 
 
 class CUDOperation(strEnum):
@@ -140,7 +142,7 @@ def deprovision_ip_trunk(subscription: Iptrunk, process_id: UUIDstr, dry_run: bo
 
 
 @inputstep("Await provisioning proxy results", assignee=Assignee("SYSTEM"))
-def await_pp_results(subscription: SubscriptionModel, label_text: str) -> FormGenerator:
+def await_pp_results(subscription: SubscriptionModel, label_text: str = DEFAULT_LABEL) -> FormGenerator:
     class ProvisioningResultPage(FormPage):
         class Config:
             title = f"Deploying {subscription.product.name}..."
@@ -150,7 +152,7 @@ def await_pp_results(subscription: SubscriptionModel, label_text: str) -> FormGe
         confirm: Accept = Accept("INCOMPLETE")
 
         @validator("pp_run_results", allow_reuse=True, pre=True, always=True)
-        def run_results_must_be_given(cls, run_results: dict) -> dict | None:
+        def run_results_must_be_given(cls, run_results: dict) -> dict | NoReturn:
             if run_results is None:
                 raise ValueError("Run results may not be empty. Wait for the provisioning proxy to finish.")
             return run_results
@@ -172,8 +174,9 @@ def confirm_pp_results(state: State) -> FormGenerator:
 
         run_status: str = ReadOnlyField(state["pp_run_results"]["status"])
         run_results: LongText = ReadOnlyField(f"{state['pp_run_results']['output']}")
+        pp_result_good_enough: bool = state["pp_run_results"]["return_code"] == 0
         confirm: Accept = Accept("INCOMPLETE")
 
-    yield ConfirmRunPage
+    confirmation = yield ConfirmRunPage
 
-    return state
+    return {"pp_did_succeed": confirmation.pp_result_good_enough}
diff --git a/gso/translations/en-GB.json b/gso/translations/en-GB.json
index 418e7e85..ea950f27 100644
--- a/gso/translations/en-GB.json
+++ b/gso/translations/en-GB.json
@@ -5,6 +5,8 @@
             "confirm_info": "Please verify this form looks correct.",
 
             "pp_run_results": "Provisioning proxy results are not ready yet.",
+            "pp_result_good_enough": "Provisioning Proxy results look good (enough).",
+            "pp_result_good_enough_info": "If set to false, this will retry the provisioning up to two times.",
 
             "site_bgp_community_id": "Site BGP community ID",
             "site_internal_id": "Site internal ID",
diff --git a/gso/workflows/device/create_device.py b/gso/workflows/device/create_device.py
index f162f5d4..cab0f323 100644
--- a/gso/workflows/device/create_device.py
+++ b/gso/workflows/device/create_device.py
@@ -9,7 +9,7 @@ from orchestrator.forms import FormPage
 from orchestrator.forms.validators import Choice
 from orchestrator.targets import Target
 from orchestrator.types import FormGenerator, State, SubscriptionLifecycle, UUIDstr
-from orchestrator.workflow import StepList, done, init, step, workflow
+from orchestrator.workflow import StepList, conditional, done, init, step, workflow
 from orchestrator.workflows.steps import resync, set_status, store_process_subscription
 from orchestrator.workflows.utils import wrap_create_initial_input_form
 
@@ -118,7 +118,7 @@ def initialize_subscription(
 
     subscription = device.DeviceProvisioning.from_other_lifecycle(subscription, SubscriptionLifecycle.PROVISIONING)
 
-    return {"subscription": subscription}
+    return {"subscription": subscription, "pp_did_succeed": False}
 
 
 @step("Provision device [DRY RUN]")
@@ -153,24 +153,50 @@ def provision_device_real(subscription: DeviceProvisioning, process_id: UUIDstr)
     }
 
 
+@step("Reset Provisioning Proxy state")
+def reset_pp_success_state() -> State:
+    return {"pp_did_succeed": False}
+
+
 @workflow(
     "Create device",
     initial_input_form=wrap_create_initial_input_form(initial_input_form_generator),
     target=Target.CREATE,
 )
 def create_device() -> StepList:
+    should_retry_pp_steps = conditional(lambda state: not state.get("pp_did_succeed"))
+
     return (
         init
         >> create_subscription
         >> store_process_subscription(Target.CREATE)
         >> initialize_subscription
         >> get_info_from_ipam
-        >> provision_device_dry
-        >> await_pp_results
-        >> confirm_pp_results
-        >> provision_device_real
-        >> await_pp_results
-        >> confirm_pp_results
+        # First attempt at dry run
+        >> should_retry_pp_steps(provision_device_dry)
+        >> should_retry_pp_steps(await_pp_results)
+        >> should_retry_pp_steps(confirm_pp_results)
+        # Second attempt
+        >> should_retry_pp_steps(provision_device_dry)
+        >> should_retry_pp_steps(await_pp_results)
+        >> should_retry_pp_steps(confirm_pp_results)
+        # Third and last try
+        >> should_retry_pp_steps(provision_device_dry)
+        >> should_retry_pp_steps(await_pp_results)
+        >> should_retry_pp_steps(confirm_pp_results)
+        >> reset_pp_success_state
+        # First attempt at provisioning
+        >> should_retry_pp_steps(provision_device_real)
+        >> should_retry_pp_steps(await_pp_results)
+        >> should_retry_pp_steps(confirm_pp_results)
+        # Second attempt
+        >> should_retry_pp_steps(provision_device_real)
+        >> should_retry_pp_steps(await_pp_results)
+        >> should_retry_pp_steps(confirm_pp_results)
+        # Third and last try
+        >> should_retry_pp_steps(provision_device_real)
+        >> should_retry_pp_steps(await_pp_results)
+        >> should_retry_pp_steps(confirm_pp_results)
         >> set_status(SubscriptionLifecycle.ACTIVE)
         >> resync
         >> done
diff --git a/tox.ini b/tox.ini
index 9ecc3385..8bfef47e 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,14 +1,11 @@
 [flake8]
-ignore = D100,D101,D102,D103,D104,D105,D106,D107,D202,E501,RST301,RST304,W503,E203,C417,T202,S101
-; extend-ignore = E203
+; Allow >> on newline (W503), and allow cls as first argument for pydantic validators (B902)
+ignore = B902,W503
 exclude = .git,.*_cache,.eggs,*.egg-info,__pycache__,venv,.tox,gso/migrations,docs
 enable-extensions = G
 select = B,C,D,E,F,G,I,N,S,T,W,B902,B903,R
 max-line-length = 120
 ban-relative-imports = true
-per-file-ignores =
-	# Allow first argument to be cls instead of self for pydantic validators
-	gso/*: B902
 
 [testenv]
 deps =
-- 
GitLab