
Fix the remaining tests for py37 (#166)

* DecentralizedAverager is now compatible with Python 3.7's asyncio exceptions (see the sketch below the commit message)
    * the problem was: grpc.aio under Python 3.7 raised concurrent.futures.CancelledError in some cases;
    * we relied on issubclass(asyncio.CancelledError, Exception) == False,
    * but on Python 3.7, issubclass(concurrent.futures.CancelledError, Exception) == True, so cancellations were caught by generic except-Exception blocks
* DecentralizedAverager now shuts down if dereferenced in the main process
    * though it won't shut down if dereferenced in forks, for obvious reasons
* HIVEMIND_THREADS now actually works
* test_averaging now shuts down dht and averager instances to avoid leaking processes

Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>
justheuristic 4 years ago
commit 690c9dc32b
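
A minimal sketch (not part of the commit) of the class-hierarchy difference behind the first fix; it only illustrates the issubclass checks and the catch pattern used in matchmaking.py, not the grpc.aio behaviour itself:

```python
import asyncio
import concurrent.futures
import sys

print(sys.version_info[:2])

# On Python 3.7, asyncio.CancelledError is an alias of concurrent.futures.CancelledError
# and derives from Exception; on 3.8+, asyncio.CancelledError derives from BaseException.
print(issubclass(asyncio.CancelledError, Exception))             # True on 3.7, False on 3.8+
print(issubclass(concurrent.futures.CancelledError, Exception))  # True on 3.7

# Hence the fix: catch both classes explicitly before any generic `except Exception`.
try:
    raise concurrent.futures.CancelledError()
except (concurrent.futures.CancelledError, asyncio.CancelledError):
    print("cancellation handled explicitly on both 3.7 and 3.8+")
```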

+ 1 - 1
hivemind/__init__.py

@@ -3,4 +3,4 @@ from hivemind.dht import *
 from hivemind.server import *
 from hivemind.utils import *
 
-__version__ = '0.9.3'
+__version__ = '0.9.4'

+ 41 - 19
hivemind/client/averaging/__init__.py

@@ -6,6 +6,8 @@ import asyncio
 import contextlib
 import ctypes
 import multiprocessing as mp
+import threading
+import weakref
 from concurrent.futures.thread import ThreadPoolExecutor
 from typing import Sequence, Optional, Tuple, Any, Union, Dict, AsyncIterator
 
@@ -123,10 +125,12 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         self._port = mp.Value(ctypes.c_uint32, 0)  # assigned when averager starts, accessible via self.port
         self._averager_endpoint: Optional[Endpoint] = None
         self.ready = mp.Event()  # whether the averager process has started (and ready for incoming requests)
-
+        # note: the state fetcher runs as a daemon thread and holds only a weakref to self, so the averager can still be garbage-collected
+        background_fetcher = threading.Thread(daemon=True, target=_background_thread_fetch_current_state,
+                                              args=[self.pipe, weakref.WeakMethod(self.get_current_state)])
+        background_fetcher.start()
         if start:
             self.run_in_background(await_ready=True)
-            hivemind.run_in_background(self._background_thread_fetch_current_state_if_asked)
 
     @property
     def port(self) -> Optional[Port]:
@@ -183,10 +187,15 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         """ Shut down the averager process """
         # TODO notify peers before terminating
         if self.is_alive():
+            self._pipe.send(('_SHUTDOWN', None))
             self.terminate()
         else:
             logger.warning("DHT shutdown has no effect: the process is not alive")
 
+    def __del__(self):
+        if self.is_alive():
+            self.shutdown()
+
     def step(self, gather: Optional[DataForGather] = None, allow_retries: bool = True, timeout: Optional[float] = None,
              wait=True) -> Union[Optional[Dict[Endpoint, DataForGather]], MPFuture]:
         """
@@ -331,23 +340,6 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         self._pipe.send(('_TRIGGER_GET_CURRENT_STATE', _future))
         return await future
 
-    def _background_thread_fetch_current_state_if_asked(self):
-        """ Executed in the host process as a background thread. """
-        while True:
-            trigger, future = self.pipe.recv()
-            assert trigger == '_TRIGGER_GET_CURRENT_STATE'
-            try:
-                state_metadata, state_tensors = self.get_current_state()
-                # note: we cast tensors to CPU on host side to avoid initializing cuda in the guest process
-                assert isinstance(state_metadata, bytes)
-                state_tensors = tuple(tensor.cpu().detach().requires_grad_(tensor.requires_grad)
-                                      for tensor in state_tensors)
-                future.set_result((state_metadata, state_tensors))
-            except BaseException as e:
-                future.set_exception(e)
-                logger.warning(e)
-                continue
-
     def load_state_from_peers(self, wait=True) -> Optional[Tuple[bytes, Sequence[torch.Tensor]]]:
         """
         Try to download the latest optimizer state from one of the existing peers.
@@ -439,3 +431,33 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
 def is_power_of_two(n):
     """ Check whether n is a power of 2 """
     return (n != 0) and (n & (n - 1) == 0)
+
+
+def _background_thread_fetch_current_state(pipe: mp.connection.Connection, get_current_state_ref: weakref.WeakMethod):
+    """
+    Executed in the host process as a background thread. Fetches the averager state when asked by peers.
+    :param pipe: DecentralizedAverager's control pipe (from host process side)
+    :param get_current_state_ref: a WeakMethod wrapped around DecentralizedAverager.get_current_state (instance-bound)
+    """
+    while True:
+        trigger, future = pipe.recv()
+        if trigger == '_SHUTDOWN':
+            break
+
+        assert trigger == '_TRIGGER_GET_CURRENT_STATE'
+        try:
+            get_current_state = get_current_state_ref()
+            if get_current_state is None:
+                break
+            state_metadata, state_tensors = get_current_state()
+            del get_current_state
+
+            assert isinstance(state_metadata, bytes)
+            state_tensors = tuple(tensor.cpu().detach().requires_grad_(tensor.requires_grad)
+                                  for tensor in state_tensors)
+            # note: we cast tensors to CPU on host side to avoid initializing cuda in the guest process
+            future.set_result((state_metadata, state_tensors))
+        except BaseException as e:
+            future.set_exception(e)
+            logger.warning(e)
+            continue
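
For illustration, a hedged usage sketch of the new shutdown-on-dereference behaviour; the constructor arguments are abbreviated and may not match this version's exact signature:

```python
import torch
import hivemind

dht = hivemind.DHT(start=True)
averager = hivemind.DecentralizedAverager(
    averaged_tensors=[torch.zeros(8)], dht=dht, prefix='demo', target_group_size=2, start=True)

averager.shutdown()  # explicit shutdown, as the updated tests now do

# __del__ also shuts the averager down if it is still alive when the last
# host-process reference is dropped: it sends '_SHUTDOWN' over the control pipe,
# which stops the background state-fetcher thread. Forked copies are unaffected.
del averager
dht.shutdown()
```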

+ 9 - 2
hivemind/client/averaging/matchmaking.py

@@ -7,6 +7,7 @@ import random
 from dataclasses import asdict
 from math import isfinite
 from typing import Sequence, Optional, AsyncIterator, Set, Tuple, Dict
+import concurrent.futures
 import asyncio
 
 import grpc
@@ -142,6 +143,8 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
                         elif len(self.current_followers) > 0:
                             await self.leader_disband_group()
                         continue
+                except (concurrent.futures.CancelledError, asyncio.CancelledError):
+                    break  # note: this is a compatibility layer for python3.7
                 except Exception as e:
                     if not self.assembled_group.done():
                         self.assembled_group.set_exception(e)
@@ -256,7 +259,8 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
                 code=averaging_pb2.BEGIN_ALLREDUCE, group_id=allreduce_group.group_id,
                 ordered_group_endpoints=allreduce_group.ordered_group_endpoints, part_sizes=allreduce_group.part_sizes,
                 gathered=allreduce_group.gathered, group_key_seed=allreduce_group.group_key_seed)
-
+        except (concurrent.futures.CancelledError, asyncio.CancelledError):
+            return  # note: this is a compatibility layer for python3.7
         except Exception as e:
             logger.exception(e)
             yield averaging_pb2.MessageFromLeader(code=averaging_pb2.INTERNAL_ERROR)
@@ -445,6 +449,8 @@ class PotentialLeaders:
                     {self.running.wait(), self.update_triggered.wait()}, return_when=asyncio.ALL_COMPLETED,
                     timeout=self.search_end_time - get_dht_time() if isfinite(self.search_end_time) else None)
                 self.update_triggered.clear()
+        except (concurrent.futures.CancelledError, asyncio.CancelledError):
+            return  # note: this is a compatibility layer for python3.7
         except Exception as e:
             logger.error(f"{self.endpoint} - caught {type(e)}: {e}")
             raise
@@ -463,7 +469,8 @@ class PotentialLeaders:
                     await asyncio.sleep(self.declared_expiration_time - get_dht_time())
                     if self.running.is_set() and len(self.leader_queue) == 0:
                         await key_manager.update_key_on_not_enough_peers()
-
+            except (concurrent.futures.CancelledError, asyncio.CancelledError):
+                pass  # note: this is a compatibility layer for python3.7
             except Exception as e:  # note: we catch exceptions here because otherwise they are never printed
                 logger.error(f"{self.endpoint} - caught {type(e)}: {e}")
             finally:

+ 1 - 1
hivemind/utils/threading.py

@@ -12,7 +12,7 @@ def run_in_background(func: callable, *args, **kwargs) -> Future:
     """ run func(*args, **kwargs) in background and return Future for its outputs """
     global EXECUTOR_PID, GLOBAL_EXECUTOR
     if os.getpid() != EXECUTOR_PID:
-        GLOBAL_EXECUTOR = ThreadPoolExecutor(max_workers=os.environ.get("HIVEMIND_THREADS", float('inf')))
+        GLOBAL_EXECUTOR = ThreadPoolExecutor(max_workers=float(os.environ.get("HIVEMIND_THREADS", 'inf')))
         EXECUTOR_PID = os.getpid()
     return GLOBAL_EXECUTOR.submit(func, *args, **kwargs)
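
Why the one-line change makes HIVEMIND_THREADS effective (a sketch, not part of the diff): os.environ.get returns a string whenever the variable is set, and ThreadPoolExecutor compares max_workers against 0, so the old code raised a TypeError as soon as HIVEMIND_THREADS was defined; converting with float() handles both the environment value and the 'inf' default:

```python
import os
from concurrent.futures import ThreadPoolExecutor

os.environ["HIVEMIND_THREADS"] = "4"

old_value = os.environ.get("HIVEMIND_THREADS", float('inf'))  # old code: the string "4"
new_value = float(os.environ.get("HIVEMIND_THREADS", 'inf'))  # new code: the float 4.0

try:
    ThreadPoolExecutor(max_workers=old_value)   # "4" <= 0 raises TypeError
except TypeError as e:
    print("old behaviour:", e)

executor = ThreadPoolExecutor(max_workers=new_value)  # a usable worker limit
print(executor.submit(lambda: "ok").result())
executor.shutdown()
```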
 

+ 20 - 0
tests/test_averaging.py

@@ -70,6 +70,10 @@ def test_allreduce_once():
             for ref, our in zip(reference, averaged_tensors):
                 assert torch.allclose(ref, our, atol=1e-6)
 
+    for averager in averagers:
+        averager.shutdown()
+    dht.shutdown()
+
 
 def compute_mean_std(averagers, unbiased=True):
     results = []
@@ -108,6 +112,10 @@ def test_allreduce_grid():
         else:
             assert torch.allclose(stds, torch.zeros_like(stds), atol=1e-6, rtol=0)
 
+    for averager in averagers:
+        averager.shutdown()
+    dht.shutdown()
+
 
 @pytest.mark.forked
 def test_allgather():
@@ -133,6 +141,10 @@ def test_allgather():
         for endpoint in gathered:
             assert gathered[endpoint] == reference_metadata[endpoint]
 
+    for averager in averagers:
+        averager.shutdown()
+    dht.shutdown()
+
 
 @pytest.mark.forked
 @pytest.mark.asyncio
@@ -249,6 +261,10 @@ def test_too_few_peers():
     for future in step_futures:
         assert len(future.result()) == 2
 
+    for averager in averagers:
+        averager.shutdown()
+    dht.shutdown()
+
 
 @pytest.mark.forked
 def test_overcrowded():
@@ -262,6 +278,10 @@ def test_overcrowded():
         step_futures = [averager.step(wait=False, timeout=5) for averager in averagers]
         assert sum(len(future.result() or []) == 2 for future in step_futures) >= len(averagers) - 1
 
+    for averager in averagers:
+        averager.shutdown()
+    dht.shutdown()
+
 
 @pytest.mark.forked
 def test_load_state_from_peers():