From a8c85cd712b7e29d0615c4b75626d84addf543fe Mon Sep 17 00:00:00 2001
From: dobli <dobler.alex@gmail.com>
Date: Thu, 25 Apr 2019 02:04:10 +0200
Subject: [PATCH] refined restore wait times

---
 building_manager.py | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/building_manager.py b/building_manager.py
index 41b4505..ec80813 100755
--- a/building_manager.py
+++ b/building_manager.py
@@ -1272,6 +1272,7 @@ def remove_label_from_nodes(label, value, manager=None):
     :label: Label you want to remove
     :value: The value to match before removing
     :manager: Docker machine to use for command, otherwise local
+    :return: Nodes with removed label
     """
     client = get_docker_client(manager)
 
@@ -1287,6 +1288,7 @@ def remove_label_from_nodes(label, value, manager=None):
         logging.info(f'Remove label {label} with value {value} from {m}')
 
     client.close()
+    return [n.id for n in matching_nodes]
 
 
 def assign_label_to_node(nodeid, label, value, manager=None):
@@ -1368,11 +1370,17 @@ def restore_building_backup(manager, building, new_machine=None):
 
     # When a new machine is used, (un-)assign labels
     if new_machine:
-        remove_label_from_nodes('building', building, manager)
+        old_nodes = remove_label_from_nodes('building', building, manager)
         assign_label_to_node(new_machine, 'building', building, manager)
         print("Wait for services to start on new machine")
-        sleep(10)
-        run_command_in_service('backup', 'restore', new_machine)
+        if wait_for_containers(new_machine, 'backup|sftp', expected_count=2):
+            run_command_in_service('backup', 'restore', new_machine)
+        else:
+            logging.error(f"Failed to start services on {new_machine}")
+            # restore labels to old nodes
+            remove_label_from_nodes('building', building, manager)
+            for on in old_nodes:
+                assign_label_to_node(on, 'building', building, manager)
     else:
         # execute restore command in backup service
         run_command_in_service('backup', 'restore', manager)
@@ -1384,6 +1392,30 @@ def restore_building_backup(manager, building, new_machine=None):
 
     # close client
     client.close()
+
+
+def wait_for_containers(machine, name_filter, expected_count=1, timeout=60):
+    """Poll until enough containers matching the name filter are listed
+
+    :machine: machine whose Docker client is queried for containers
+    :name_filter: regexp to filter names by
+    :expected_count: minimum number of containers expected to match
+    :timeout: Number of polls (about one second apart) before aborting
+    :returns: True if found (after a 3s grace sleep), False when timed out
+    """
+    from contextlib import closing  # ensures client.close() on every exit
+    with closing(get_docker_client(machine)) as client:
+        for _ in range(timeout):
+            cl = client.containers.list(filters={'name': name_filter})
+            if len(cl) >= expected_count:
+                logging.info("Let serivces boot up")
+                sleep(3)
+                return True
+            sleep(1)
+    logging.error(f"Timed out wait for containers matching {name_filter}.")
+    return False
+
+
 # >>>
 
 
-- 
GitLab