← Back to team overview

canonical-ubuntu-qa team mailing list archive

[Merge] ~andersson123/autopkgtest-cloud:copy-security-group-more-robust into autopkgtest-cloud:master

 

Tim Andersson has proposed merging ~andersson123/autopkgtest-cloud:copy-security-group-more-robust into autopkgtest-cloud:master.

Requested reviews:
  Canonical's Ubuntu QA (canonical-ubuntu-qa)

For more details, see:
https://code.launchpad.net/~andersson123/autopkgtest-cloud/+git/autopkgtest-cloud/+merge/477569

Adds a timeout and a sleep-and-retry loop around the OpenStack API calls to make them more robust, so that autopkgtest@*.service units do not end up in an inactive/dead state.
-- 
Your team Canonical's Ubuntu QA is requested to review the proposed merge of ~andersson123/autopkgtest-cloud:copy-security-group-more-robust into autopkgtest-cloud:master.
diff --git a/charms/focal/autopkgtest-cloud-worker/autopkgtest-cloud/tools/copy-security-group b/charms/focal/autopkgtest-cloud-worker/autopkgtest-cloud/tools/copy-security-group
index 3de9a98..f1df12a 100755
--- a/charms/focal/autopkgtest-cloud-worker/autopkgtest-cloud/tools/copy-security-group
+++ b/charms/focal/autopkgtest-cloud-worker/autopkgtest-cloud/tools/copy-security-group
@@ -9,11 +9,37 @@ If --delete-only is given, it only deletes existing groups called NAME.
 
 import argparse
 import os
+import signal
+import time
+from contextlib import contextmanager
 
 from keystoneauth1 import session
+from keystoneauth1.exceptions import InternalServerError
 from keystoneauth1.identity import v2, v3
 from neutronclient.v2_0 import client
 
+
+class TimeOutException(Exception):
+    """Raised by the SIGALRM handler that raise_timeout() installs."""
+
+
+@contextmanager
+def raise_timeout(timeout):
+    """Raise TimeOutException if the body runs longer than timeout seconds."""
+    def _handler(signum, frame):
+        raise TimeOutException
+    previous = signal.signal(signal.SIGALRM, _handler)
+    signal.alarm(timeout)
+    try:
+        yield
+    except TimeOutException:
+        print(f"Timeout after {timeout} seconds")
+        raise
+    finally:
+        signal.alarm(0)  # cancel the pending alarm
+        signal.signal(signal.SIGALRM, previous)  # restore prior handler
+
+
 # Members in a security group rule that cannot be copied.
 RULE_MEMBERS_IGNORE = [
     "id",
@@ -27,6 +53,10 @@ RULE_MEMBERS_IGNORE = [
     "normalized_cidr",
 ]
 
+# Timeout and retry pause, in seconds, for calls hitting InternalServerError
+NOVA_TIMEOUT = 600
+NOVA_RETRY_SLEEP_DURATION = 30
+
 
 def main():
     parser = argparse.ArgumentParser(description="Copy security groups")
@@ -68,43 +98,87 @@ def main():
         region_name=os.environ["OS_REGION_NAME"],
     )
 
-    # Find the source group - crashes if it does not exists
-    source = [
-        g
-        for g in neutron.list_security_groups()["security_groups"]
-        if g["name"] == args.source
-    ][0]
+    security_groups = None
+    with raise_timeout(NOVA_TIMEOUT):
+        while security_groups is None:  # an empty list is a valid reply
+            try:
+                security_groups = neutron.list_security_groups()[
+                    "security_groups"
+                ]
+            except InternalServerError as e:
+                print(f"Listing security groups failed with {e}")
+                time.sleep(NOVA_RETRY_SLEEP_DURATION)
+
+    source = [g for g in security_groups if g["name"] == args.source][0]
 
     description = "copy {} of {} ({})".format(
         args.name, args.source, source["description"]
     )
 
     # Delete any existing group with the same name
-    existing_groups = [
-        g
-        for g in neutron.list_security_groups()["security_groups"]
-        if g["name"] == args.name
-    ]
-    existing_ports = neutron.list_ports()["ports"]
+    existing_groups = [g for g in security_groups if g["name"] == args.name]
+
+    existing_ports = None
+    with raise_timeout(NOVA_TIMEOUT):
+        while existing_ports is None:  # an empty list is a valid reply
+            try:
+                existing_ports = neutron.list_ports()["ports"]
+            except InternalServerError as e:
+                print(f"Listing ports failed with {e}")
+                time.sleep(NOVA_RETRY_SLEEP_DURATION)
+
     for target in existing_groups:
         print("Deleting existing group", target)
         for port in existing_ports:
             if target["id"] in port["security_groups"]:
                 print("Deleting port in group:", target["id"])
+                deleted = False
+                with raise_timeout(NOVA_TIMEOUT):
+                    while not deleted:
+                        try:
+                            neutron.delete_port(port["id"])
+                            deleted = True
+                        except Exception as e:
+                            print(f"Could not delete port: {e}")
+                            time.sleep(NOVA_RETRY_SLEEP_DURATION)
+
+        with raise_timeout(NOVA_TIMEOUT):
+            while True:
                 try:
-                    neutron.delete_port(port["id"])
+                    neutron.delete_security_group(target["id"])
+                    break  # deleted: stop retrying
                 except Exception as e:
-                    print("Could not delete port:", e)
-        neutron.delete_security_group(target["id"])
+                    print(f"Could not delete security group: {e}")
+                    time.sleep(NOVA_RETRY_SLEEP_DURATION)
 
     if not args.delete_only:
         print("Creating", description)
-        target = neutron.create_security_group(
-            {"security_group": {"name": args.name, "description": description}}
-        )["security_group"]
+        with raise_timeout(NOVA_TIMEOUT):
+            target = None
+            while not target:
+                try:
+                    target = neutron.create_security_group(
+                        {
+                            "security_group": {
+                                "name": args.name,
+                                "description": description,
+                            }
+                        }
+                    )["security_group"]
+                except Exception as e:
+                    print(f"Failed to create security group: {e}")
+                    time.sleep(NOVA_RETRY_SLEEP_DURATION)
 
         for rule in target["security_group_rules"]:
-            neutron.delete_security_group_rule(rule["id"])
+            with raise_timeout(NOVA_TIMEOUT):
+                deleted = False
+                while not deleted:
+                    try:
+                        neutron.delete_security_group_rule(rule["id"])
+                        deleted = True
+                    except Exception as e:
+                        print(f"Failed to delete security group rule: {e}")
+                        time.sleep(NOVA_RETRY_SLEEP_DURATION)
 
         for rule in source["security_group_rules"]:
             rule = {
@@ -116,7 +190,19 @@ def main():
             rule["security_group_id"] = target["id"]
 
             print("Copying rule", rule)
-            neutron.create_security_group_rule({"security_group_rule": rule})
+            with raise_timeout(NOVA_TIMEOUT):
+                created = False
+                while not created:
+                    try:
+                        neutron.create_security_group_rule(
+                            {"security_group_rule": rule}
+                        )
+                        created = True
+                    except Exception as e:
+                        print(
+                            f"Failed to create security group rule with: {e}"
+                        )
+                        time.sleep(NOVA_RETRY_SLEEP_DURATION)
 
 
 if __name__ == "__main__":