@@ -12,14 +12,15 @@ import asyncio
 
 import grpc
 import torch
 
-import hivemind
 from hivemind.client.averaging.allreduce import AllReduceRunner
 from hivemind.client.averaging.load_balancing import load_balance_peers
-from hivemind.dht import DHTID, DHTExpiration, get_dht_time, GroupKey
-from hivemind.utils import get_logger, Endpoint, TensorDescriptor, MSGPackSerializer, TimedStorage
+from hivemind.client.averaging.key_manager import GroupKeyManager, GroupKey
+from hivemind.dht import DHT, DHTID, DHTExpiration, get_dht_time
+from hivemind.utils import get_logger, Endpoint, TensorDescriptor, MSGPackSerializer, timed_storage, TimedStorage
 from hivemind.proto import averaging_pb2, averaging_pb2_grpc
 from hivemind.utils.grpc import ChannelCache
 
+
 logger = get_logger(__name__)
 
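Reviewer note: the new `GroupKeyManager` import takes over the `prefix`/`group_bits` bookkeeping that previously lived inside `Matchmaking` itself. To make the later hunks easier to read in isolation, here is a rough interface sketch inferred purely from the call sites in this diff; the type aliases, default arguments and method bodies below are assumptions, not the contents of `hivemind/client/averaging/key_manager.py`.

```python
# Hypothetical interface stub inferred from how this diff uses GroupKeyManager.
# The real class lives in hivemind/client/averaging/key_manager.py and differs.
from typing import List, Optional, Tuple

Endpoint = str         # simplified stand-ins for the hivemind type aliases
GroupKey = str
DHTExpiration = float


class GroupKeyManagerSketch:
    def __init__(self, dht, endpoint: Endpoint, prefix: str,
                 initial_group_bits: Optional[str], target_group_size: int):
        self.dht, self.endpoint, self.prefix = dht, endpoint, prefix
        self.group_bits = initial_group_bits or ''
        self.target_group_size = target_group_size

    @property
    def current_key(self) -> GroupKey:
        # same format the removed Matchmaking.current_group_key property produced
        return f"{self.prefix}.0b{self.group_bits}"

    async def declare_averager(self, group_key: GroupKey, endpoint: Endpoint,
                               expiration_time: DHTExpiration,
                               looking_for_group: bool = True) -> bool:
        # assumed to wrap dht.declare_averager(..., return_future=True), as in the removed call sites
        raise NotImplementedError

    async def get_averagers(self, group_key: GroupKey,
                            only_active: bool) -> List[Tuple[Endpoint, DHTExpiration]]:
        # assumed to wrap dht.get_averagers(..., return_future=True)
        raise NotImplementedError

    async def update_key_on_group_assembled(self, allreduce_group, is_leader: bool = False):
        # re-derives self.group_bits from the shared group_key_seed (see later hunks)
        raise NotImplementedError

    async def update_key_on_not_enough_peers(self):
        # called when a search round expires with an empty leader queue
        raise NotImplementedError
```

Only the constructor argument order and the method names/keywords are taken from this diff; everything else is placeholder.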
@@ -36,7 +37,7 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
     Hence, instead of accounting for such deadlocks, we simply break them with request_timeout.
     """
 
-    def __init__(self, endpoint: Endpoint, averaged_tensors: Sequence[torch.Tensor], dht: hivemind.dht.DHT, *,
+    def __init__(self, endpoint: Endpoint, averaged_tensors: Sequence[torch.Tensor], dht: DHT, *,
                  prefix: str, target_group_size: int, min_group_size: int, initial_group_bits: Optional[str] = None,
                  averaging_expiration: float = 15, request_timeout: float, throughput: Optional[float] = None,
                  min_vector_size: int, **allreduce_kwargs):
@@ -46,12 +47,11 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
                            "matchmaking can cause deadlocks in some rare cases. Please see Matchmaking docstring.")
 
         super().__init__()
-        self.dht, self.endpoint, self.averaged_tensors = dht, endpoint, tuple(averaged_tensors)
-        self.prefix, self.group_bits = prefix, initial_group_bits
+        self.endpoint, self.averaged_tensors = endpoint, tuple(averaged_tensors)
+        self.group_key_manager = GroupKeyManager(dht, endpoint, prefix, initial_group_bits, target_group_size)
         self.target_group_size, self.min_group_size = target_group_size, min_group_size
         self.averaging_expiration, self.request_timeout = averaging_expiration, request_timeout
-        self.throughput, self.min_vector_size = throughput, min_vector_size
-        self.allreduce_kwargs = allreduce_kwargs
+        self.throughput, self.min_vector_size, self.allreduce_kwargs = throughput, min_vector_size, allreduce_kwargs
         self.schema_hash = compute_schema_hash(self.averaged_tensors)
         self.total_size = sum(tensor.numel() for tensor in self.averaged_tensors)
 
@@ -63,17 +63,13 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
 
         self.current_leader: Optional[Endpoint] = None  # iff i am a follower, this is a link to my current leader
         self.current_followers: Dict[Endpoint, averaging_pb2.JoinRequest] = {}  # my current followers excluding myself
-        self.potential_leaders = PotentialLeaders(endpoint, dht, averaging_expiration, target_group_size)
-        self.data_for_gather: bytes = None
+        self.potential_leaders = PotentialLeaders(endpoint, averaging_expiration, target_group_size)
+        self.data_for_gather: Optional[bytes] = None
 
     @property
     def is_looking_for_group(self):
         return self.lock_looking_for_group.locked()
 
-    @property
-    def current_group_key(self) -> GroupKey:
-        return f"{self.prefix}.0b{self.group_bits}"
-
     def __repr__(self):
         lfg_status = "looking for group," if self.is_looking_for_group else "not looking for group,"
         if self.is_looking_for_group:
@@ -83,12 +79,12 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
             lfg_status += f" leading {len(self.current_followers)} followers,"
         schema_hash_repr = f"{self.schema_hash[0]}...{self.schema_hash[-8:]}"
         return f"{self.__class__.__name__}(endpoint={self.endpoint}, schema={schema_hash_repr}, {lfg_status}" \
-               f" current key = {self.current_group_key})"
+               f" current key = {self.group_key_manager.current_key})"
 
     async def look_for_group(self, *, data_for_gather: bytes = b'', timeout: Optional[float] = None
                              ) -> Optional[AllReduceRunner]:
         """
-        :param gather: optionally send this data to all peers in the next group and gather it from every groupmate
+        :param data_for_gather: optionally send this data to all peers in the next group and gather it from groupmates
         :param timeout: maximum time that may be spent looking for group (does not include allreduce itself)
         :returns: an assembled group if successful, None if failed; does NOT perform the actual averaging
         Iterate over the averagers from a given group_identifier that have higher leadership priority than yourself.
@@ -127,10 +123,7 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
 
     async def _request_join_potential_leaders(self, timeout: Optional[float]) -> AllReduceRunner:
         """ Request leaders from queue until we find the first runner. This coroutine is meant to run in background. """
-        async with self.potential_leaders.begin_search(self.current_group_key, timeout):
-            # TODO update group_bits on success! reduce number of bits on not enough peers.
-            # TODO after allreduce finishes, we may need to ask leader to notify lower keys about this
-            # (so as to fix possible network partitioning if some peers operate on a much smaller nbits)
+        async with self.potential_leaders.begin_search(self.group_key_manager, timeout):
             while True:
                 try:
                     next_leader = await self.potential_leaders.pop_next_leader()  # throws TimeoutError on expiration
@@ -148,7 +141,6 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
                             return await self.leader_assemble_group()
                         elif len(self.current_followers) > 0:
                             await self.leader_disband_group()
-                            # TODO maybe adjust grid size
                             continue
                 except Exception as e:
                     if not self.assembled_group.done():
@@ -262,8 +254,8 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
             allreduce_group = self.assembled_group.result()
             yield averaging_pb2.MessageFromLeader(
                 code=averaging_pb2.BEGIN_ALLREDUCE, group_id=allreduce_group.group_id,
-                ordered_group_endpoints=allreduce_group.ordered_group_endpoints,
-                part_sizes=allreduce_group.part_sizes, gathered=allreduce_group.gathered)
+                ordered_group_endpoints=allreduce_group.ordered_group_endpoints, part_sizes=allreduce_group.part_sizes,
+                gathered=allreduce_group.gathered, group_key_seed=allreduce_group.group_key_seed)
 
         except Exception as e:
             logger.exception(e)
@@ -319,11 +311,13 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
             gathered.append(follower_info.gather if follower_info.gather else None)
 
         part_sizes = load_balance_peers(self.total_size, throughputs, self.min_vector_size)
+        group_key_seed = random.randint(- 2 ** 31, 2 ** 31 - 1)
 
         logger.debug(f"{self.endpoint} - leader started allreduce for {len(ordered_group_endpoints)} peers.")
         allreduce_group = AllReduceRunner(group_id=group_id, tensors=self.averaged_tensors, endpoint=self.endpoint,
                                           ordered_group_endpoints=ordered_group_endpoints, part_sizes=part_sizes,
-                                          gathered=gathered, **self.allreduce_kwargs)
+                                          gathered=gathered, group_key_seed=group_key_seed, **self.allreduce_kwargs)
+        await self.group_key_manager.update_key_on_group_assembled(allreduce_group, is_leader=True)
         self.assembled_group.set_result(allreduce_group)
         return allreduce_group
 
@@ -340,7 +334,9 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
         logger.debug(f"{self.endpoint} - follower started allreduce after being prompted by leader {leader}.")
         allreduce_group = AllReduceRunner(group_id=group_id, tensors=self.averaged_tensors, endpoint=self.endpoint,
                                           ordered_group_endpoints=tuple(ordered_group_endpoints),
-                                          part_sizes=tuple(part_sizes), gathered=msg.gathered, **self.allreduce_kwargs)
+                                          part_sizes=tuple(part_sizes), gathered=msg.gathered,
+                                          group_key_seed=int(msg.group_key_seed), **self.allreduce_kwargs)
+        await self.group_key_manager.update_key_on_group_assembled(allreduce_group)
         self.assembled_group.set_result(allreduce_group)
         return allreduce_group
 
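Reviewer note: the two hunks above thread a single `group_key_seed` through the whole group. The leader draws a value in the signed 32-bit range (presumably matching an `int32` field added to `MessageFromLeader`), ships it with `BEGIN_ALLREDUCE`, and every peer, leader and follower alike, hands it to `update_key_on_group_assembled`. Sharing one seed lets all group members make the same pseudorandom re-keying decision without any extra round trips. The snippet below only illustrates that determinism property; it is not the actual key-update rule.

```python
import random


def shared_decision(group_key_seed: int, ordered_group_endpoints: list) -> dict:
    """Peers that call this with the same seed and the same endpoint order
    obtain the same pseudorandom assignment without communicating."""
    rng = random.Random(group_key_seed)  # identical PRNG state on every peer
    shuffled = list(ordered_group_endpoints)
    rng.shuffle(shuffled)
    return {peer: index for index, peer in enumerate(shuffled)}


# the leader draws the seed exactly as in the hunk above
seed = random.randint(- 2 ** 31, 2 ** 31 - 1)  # fits a signed 32-bit proto field
peers = ["192.0.2.1:1337", "192.0.2.2:1337", "192.0.2.3:1337"]  # hypothetical endpoints
assert shared_decision(seed, peers) == shared_decision(seed, peers)
```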
@@ -353,9 +349,8 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
 class PotentialLeaders:
     """ An utility class that searches for averagers that could become our leaders """
 
-    def __init__(self, endpoint: Endpoint, dht: hivemind.DHT, averaging_expiration: DHTExpiration,
-                 target_group_size: Optional[int]):
-        self.endpoint, self.dht, self.averaging_expiration = endpoint, dht, averaging_expiration
+    def __init__(self, endpoint: Endpoint, averaging_expiration: DHTExpiration, target_group_size: Optional[int]):
+        self.endpoint, self.averaging_expiration = endpoint, averaging_expiration
         self.target_group_size = target_group_size
         self.running, self.update_triggered, self.update_finished = asyncio.Event(), asyncio.Event(), asyncio.Event()
         self.declared_expiration, self.lock_search, self.lock_declare = asyncio.Event(), asyncio.Lock(), asyncio.Lock()
@@ -367,12 +362,12 @@ class PotentialLeaders:
         self.search_end_time = float('inf')
 
     @contextlib.asynccontextmanager
-    async def begin_search(self, group_key: GroupKey, timeout: Optional[float]):
+    async def begin_search(self, key_manager: GroupKeyManager, timeout: Optional[float]):
         async with self.lock_search:
             self.running.set()
             self.search_end_time = get_dht_time() + timeout if timeout is not None else float('inf')
-            update_queue_task = asyncio.create_task(self._update_queue_periodically(group_key))
-            declare_averager_task = asyncio.create_task(self._declare_averager_periodically(group_key))
+            update_queue_task = asyncio.create_task(self._update_queue_periodically(key_manager))
+            declare_averager_task = asyncio.create_task(self._declare_averager_periodically(key_manager))
             try:
                 yield self
             finally:
@@ -429,38 +424,46 @@ class PotentialLeaders:
         else:
             return min(get_dht_time() + self.averaging_expiration, self.search_end_time)
 
-    async def _update_queue_periodically(self, group_key: GroupKey):
-        DISCREPANCY = hivemind.utils.timed_storage.MAX_DHT_TIME_DISCREPANCY_SECONDS
-        while get_dht_time() < self.search_end_time:
-            new_peers = await self.dht.get_averagers(group_key, only_active=True, return_future=True)
-            self.max_assured_time = max(self.max_assured_time, get_dht_time() + self.averaging_expiration - DISCREPANCY)
-
-            self.leader_queue.clear()
-            for peer, peer_expiration_time in new_peers:
-                if peer == self.endpoint or (peer, peer_expiration_time) in self.past_attempts:
-                    continue
-                self.leader_queue.store(peer, peer_expiration_time, peer_expiration_time)
-                self.max_assured_time = max(self.max_assured_time, peer_expiration_time - DISCREPANCY)
+    async def _update_queue_periodically(self, key_manager: GroupKeyManager):
+        try:
+            DISCREPANCY = timed_storage.MAX_DHT_TIME_DISCREPANCY_SECONDS
+            while get_dht_time() < self.search_end_time:
+                new_peers = await key_manager.get_averagers(key_manager.current_key, only_active=True)
+                self.max_assured_time = max(self.max_assured_time,
+                                            get_dht_time() + self.averaging_expiration - DISCREPANCY)
+
+                self.leader_queue.clear()
+                for peer, peer_expiration_time in new_peers:
+                    if peer == self.endpoint or (peer, peer_expiration_time) in self.past_attempts:
+                        continue
+                    self.leader_queue.store(peer, peer_expiration_time, peer_expiration_time)
+                    self.max_assured_time = max(self.max_assured_time, peer_expiration_time - DISCREPANCY)
 
-            self.update_finished.set()
+                self.update_finished.set()
 
-            await asyncio.wait(
-                {self.running.wait(), self.update_triggered.wait()}, return_when=asyncio.ALL_COMPLETED,
-                timeout=self.search_end_time - get_dht_time() if isfinite(self.search_end_time) else None)
-            self.update_triggered.clear()
+                await asyncio.wait(
+                    {self.running.wait(), self.update_triggered.wait()}, return_when=asyncio.ALL_COMPLETED,
+                    timeout=self.search_end_time - get_dht_time() if isfinite(self.search_end_time) else None)
+                self.update_triggered.clear()
+        except Exception as e:
+            logger.error(f"{self.endpoint} - caught {type(e)}: {e}")
+            raise
 
-    async def _declare_averager_periodically(self, group_key: GroupKey):
+    async def _declare_averager_periodically(self, key_manager: GroupKeyManager):
         async with self.lock_declare:
             try:
                 while True:
                     await self.running.wait()
 
                     new_expiration_time = min(get_dht_time() + self.averaging_expiration, self.search_end_time)
-                    self.declared_group_key, self.declared_expiration_time = group_key, new_expiration_time
+                    self.declared_group_key = group_key = key_manager.current_key
+                    self.declared_expiration_time = new_expiration_time
                     self.declared_expiration.set()
-                    await self.dht.declare_averager(group_key, self.endpoint, new_expiration_time,
-                                                    looking_for_group=True, return_future=True)
+                    await key_manager.declare_averager(group_key, self.endpoint, expiration_time=new_expiration_time)
                     await asyncio.sleep(self.declared_expiration_time - get_dht_time())
+                    if self.running.is_set() and len(self.leader_queue) == 0:
+                        await key_manager.update_key_on_not_enough_peers()
+
             except Exception as e:  # note: we catch exceptions here because otherwise they are never printed
                 logger.error(f"{self.endpoint} - caught {type(e)}: {e}")
             finally:
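Reviewer note: the TODO removed in `_request_join_potential_leaders` ("reduce number of bits on not enough peers") is what `update_key_on_not_enough_peers` is now expected to handle. When a declaration expires and the leader queue is still empty, the averager asks the key manager to fall back to a coarser key so it can meet peers from neighbouring buckets; since the declare loop re-reads `key_manager.current_key` on every iteration, the next declaration automatically uses the updated key. The actual policy lives in `key_manager.py`; the sketch below is only one hypothetical way such a fallback could look.

```python
# Hypothetical fallback policy -- NOT the actual GroupKeyManager implementation.
class BitFallbackSketch:
    def __init__(self, prefix: str, group_bits: str):
        self.prefix, self.group_bits = prefix, group_bits

    @property
    def current_key(self) -> str:
        # same "{prefix}.0b{bits}" format used by the removed current_group_key property
        return f"{self.prefix}.0b{self.group_bits}"

    async def update_key_on_not_enough_peers(self) -> None:
        # Dropping the trailing bit halves the number of distinct keys, so
        # averagers from two adjacent buckets start seeing each other.
        if self.group_bits:
            self.group_bits = self.group_bits[:-1]


# e.g. "myprefix.0b0110" -> "myprefix.0b011" -> "myprefix.0b01" -> ...
```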
@@ -468,8 +471,8 @@ class PotentialLeaders:
                     prev_declared_key, prev_expiration_time = self.declared_group_key, self.declared_expiration_time
                     self.declared_group_key, self.declared_expiration_time = None, float('inf')
                     self.leader_queue, self.max_assured_time = TimedStorage[Endpoint, DHTExpiration](), float('-inf')
-                    await self.dht.declare_averager(prev_declared_key, self.endpoint, prev_expiration_time,
-                                                    looking_for_group=False, return_future=True)
+                    await key_manager.declare_averager(prev_declared_key, self.endpoint, prev_expiration_time,
+                                                       looking_for_group=False)
 
 
 def compute_schema_hash(tensors: Sequence[torch.Tensor]) -> bytes:
@@ -478,3 +481,7 @@ def compute_schema_hash(tensors: Sequence[torch.Tensor]) -> bytes:
                      for field_name, field_value in asdict(TensorDescriptor.from_tensor(tensor)).items()}
                     for tensor in tensors]
     return DHTID.generate(source=MSGPackSerializer.dumps(schema_dicts)).to_bytes()
+
+
+class MatchmakingException(Exception):
+    """ An internal exception that marks undesired edge cases during averaging """
|