Fix random freezes in averager.step, improve error handling (#254)

- fix a heisenbug where DecentralizedAverager would randomly hang on PyTorch ops (see pytorch/pytorch#17199)
- adjust the "sanity check failed" clause in DecentralizedAverager._step so that it is no longer triggered by legitimate retries
- make Matchmaking.request_join_group handle RPC errors from individual peers instead of cancelling the entire matchmaking process

Co-authored-by: Michael Diskin <yhn1124@gmail.com>
Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com>
Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>
justheuristic, 4 years ago
Parent commit: 94b9db0d37
2 changed files with 52 additions and 33 deletions
  1. hivemind/client/averaging/__init__.py (+47 -33)
  2. hivemind/client/averaging/matchmaking.py (+5 -0)

+ 47 - 33
hivemind/client/averaging/__init__.py

@@ -158,6 +158,16 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         return f"{self.__class__.__name__}({self.endpoint})"
         return f"{self.__class__.__name__}({self.endpoint})"
 
 
     def run(self):
+        """
+        Run averager function in a background thread; this is needed to avoid a heisenbug with broken OMP on fork
+        Turns out, using a non-main thread creates a separate OMP pool that works even if the original pool is corrupted
+        Read more: https://github.com/pytorch/pytorch/issues/17199
+        """
+        thread = threading.Thread(target=self._run_internal, daemon=True)
+        thread.start()
+        thread.join()
+
+    def _run_internal(self):
         """ Serve DecentralizedAverager forever. This function will not return until the averager is shut down """
         """ Serve DecentralizedAverager forever. This function will not return until the averager is shut down """
         loop = switch_to_uvloop()
         # initialize asyncio synchronization primitives in this event loop
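
The run()/_run_internal() split above works around the fork-related hang described in the commit message: torch/OMP work on the main thread of a forked process can stall if the parent's OpenMP pool was already initialized, while a freshly spawned thread gets its own pool (pytorch/pytorch#17199). Below is a minimal, self-contained sketch of the same pattern; the class and function names are illustrative, not part of hivemind.

    import multiprocessing as mp
    import threading

    class BackgroundWorker(mp.Process):
        """Toy process that mirrors the run()/_run_internal() split above."""

        def run(self):
            # A thread spawned after the fork gets a fresh OpenMP pool,
            # sidestepping the corrupted-pool hang from pytorch/pytorch#17199.
            thread = threading.Thread(target=self._run_internal, daemon=True)
            thread.start()
            thread.join()

        def _run_internal(self):
            # heavy torch / OMP work would live here; a print stands in for it
            print("working on a non-main thread of the forked process")

    if __name__ == "__main__":
        worker = BackgroundWorker()
        worker.start()
        worker.join()
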
@@ -240,41 +250,45 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         start_time = get_dht_time()
         group_id = None
 
-        while not future.done():
-            try:
-                self._pending_group_assembled.clear()
-                data_for_gather = self.serializer.dumps([weight, self._throughput, self.listen, gather_binary])
-                group_info = await self._matchmaking.look_for_group(timeout=timeout, data_for_gather=data_for_gather)
-                if group_info is None:
-                    raise AllreduceException("Averaging step failed: could not find a group.")
-                group_id = group_info.group_id
-                allreduce_runner = await self._make_allreduce_runner(group_info, **self.allreduce_kwargs)
-                self._running_groups[group_id] = allreduce_runner
-                self._pending_group_assembled.set()
-                await asyncio.wait_for(allreduce_runner.run(), self._allreduce_timeout)
-                await loop.run_in_executor(None, self.update_tensors, allreduce_runner)
-
-                # averaging is finished, exit the loop
-                future.set_result(allreduce_runner.gathered)
-
-            except (AllreduceException, MatchmakingException, AssertionError, StopAsyncIteration, InternalError,
-                    asyncio.CancelledError, asyncio.InvalidStateError, grpc.RpcError, grpc.aio.AioRpcError) as e:
-                time_elapsed = get_dht_time() - start_time
-                if not allow_retries or (timeout is not None and timeout < time_elapsed):
-                    logger.exception(f"Averager caught {repr(e)}")
-                    future.set_exception(e)
-                else:
-                    logger.warning(f"Averager caught {repr(e)}, retrying")
+        try:
+            while not future.done():
+                try:
+                    self._pending_group_assembled.clear()
+                    data_for_gather = self.serializer.dumps([weight, self._throughput, self.listen, gather_binary])
+                    group_info = await self._matchmaking.look_for_group(timeout=timeout, data_for_gather=data_for_gather)
+                    if group_info is None:
+                        raise AllreduceException("Averaging step failed: could not find a group.")
+                    group_id = group_info.group_id
+                    allreduce_runner = await self._make_allreduce_runner(group_info, **self.allreduce_kwargs)
+                    self._running_groups[group_id] = allreduce_runner
+                    self._pending_group_assembled.set()
+                    await asyncio.wait_for(allreduce_runner.run(), self._allreduce_timeout)
+                    await loop.run_in_executor(None, self.update_tensors, allreduce_runner)
+
+                    # averaging is finished, exit the loop
+                    future.set_result(allreduce_runner.gathered)
+
+                except (AllreduceException, MatchmakingException, AssertionError, StopAsyncIteration, InternalError,
+                        asyncio.CancelledError, asyncio.InvalidStateError, grpc.RpcError, grpc.aio.AioRpcError) as e:
+                    time_elapsed = get_dht_time() - start_time
+                    if not allow_retries or (timeout is not None and timeout < time_elapsed):
+                        logger.exception(f"Averager caught {repr(e)}")
+                        future.set_exception(e)
+                    else:
+                        logger.warning(f"Averager caught {repr(e)}, retrying")
 
-            except BaseException as e:
+                finally:
+                    _ = self._running_groups.pop(group_id, None)
+                    self._pending_group_assembled.set()
+
+        except BaseException as e:
+            if not future.done():
                 future.set_exception(e)
-                raise
-            finally:
-                _ = self._running_groups.pop(group_id, None)
-                self._pending_group_assembled.set()
-                if not future.done():
-                    future.set_exception(RuntimeError("Internal sanity check failed: averager.step left future pending."
-                                                      " Please report this to hivemind issues."))
+            raise
+        finally:
+            if not future.done():
+                future.set_exception(RuntimeError("Internal sanity check failed: averager.step left future pending."
+                                                  " Please report this to hivemind issues."))
 
     async def _make_allreduce_runner(self, group_info: GroupInfo, min_vector_size: int, **kwargs) -> AllReduceRunner:
         """ Use a group description found by Matchmaking to form AllreduceRunner """
         """ Use a group description found by Matchmaking to form AllreduceRunner """

+ 5 - 0
hivemind/client/averaging/matchmaking.py

@@ -10,6 +10,7 @@ import concurrent.futures
 import asyncio
 
 import grpc
+import grpc._cython.cygrpc
 
 from hivemind.client.averaging.group_info import GroupInfo
 from hivemind.client.averaging.key_manager import GroupKeyManager, GroupKey
@@ -199,6 +200,10 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
             if call is not None:
                 call.cancel()
             return None
+        except (grpc.RpcError, grpc.aio.AioRpcError, grpc._cython.cygrpc.InternalError, StopAsyncIteration) as e:
+            logger.error(f"{self} - failed to request potential leader {leader}: {e}")
+            return None
+
         finally:
             self.was_accepted_to_group.clear()
             self.current_leader = None
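
The new except clause above turns a transport failure against one candidate leader into a logged None, so matchmaking simply moves on to the next candidate instead of cancelling the whole step. Here is the same pattern in isolation, with a hypothetical send_request callable and RpcError class standing in for the real grpc call and error types; the function names are simplified stand-ins, not hivemind's signatures.

    import asyncio
    import logging

    logger = logging.getLogger("matchmaking_sketch")

    class RpcError(Exception):
        """Stand-in for grpc.aio.AioRpcError / grpc._cython.cygrpc.InternalError."""

    async def request_join_group(leader, send_request):
        """Ask one potential leader to accept us; a failed RPC means 'try someone else'."""
        try:
            return await send_request(leader)         # the real code issues the join-group RPC here
        except (RpcError, StopAsyncIteration) as e:
            logger.error(f"failed to request potential leader {leader}: {e}")
            return None                               # caller interprets None as "keep looking"

    async def look_for_group(leaders, send_request):
        # one unreachable peer no longer aborts the search; we just try the next candidate
        for leader in leaders:
            group = await request_join_group(leader, send_request)
            if group is not None:
                return group
        return None

    async def _demo():
        async def send_request(leader):
            if leader == "dead-peer:1337":
                raise RpcError("connection reset")    # simulated transport failure
            return f"group led by {leader}"
        return await look_for_group(["dead-peer:1337", "good-peer:1337"], send_request)

    print(asyncio.run(_demo()))  # -> group led by good-peer:1337
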