Explorar el Código

load state timeout

justheuristic hace 4 años
padre
commit
97bd64fc7e
Se han modificado 2 ficheros con 3 adiciones y 2 borrados
  1. hivemind/averaging/averager.py (+1 −1)
  2. hivemind/optim/collaborative.py (+2 −1)

+ 1 - 1
hivemind/averaging/averager.py

@@ -616,7 +616,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
 
         finally:
             if not future.done():
-                logger.warning("Averager could not load state from peers: all requests have failed.")
+                logger.warning("Averager could not load state from peers: none of the requests succeeded.")
                 future.set_result(None)
 
     def get_group_bits(self, wait: bool = True):

+ 2 - 1
hivemind/optim/collaborative.py

@@ -191,8 +191,9 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
             while True:
                 try:
                     self.averager.load_state_from_peers(timeout=self.load_state_timeout, **kwargs)
+                    break
                 except BaseException as e:
-                    logger.exception(f"Failed to load state from peers: {e}, will retry now")
+                    logger.exception(f"Failed to load state from peers: {e}, retrying ...")
                     continue
 
             self.local_samples_accumulated = self.local_steps_accumulated = 0