@@ -15,53 +15,27 @@ The code is organized as follows:
 from __future__ import annotations
 import asyncio
 import ctypes
-import heapq
 import multiprocessing as mp
-import re
-from collections import deque
 from concurrent.futures import ThreadPoolExecutor
-from typing import List, Tuple, Optional, Sequence, Union, Dict, Deque, NamedTuple, Iterator, Set
-
+from typing import List, Optional, Sequence, Union, Callable, Awaitable, TypeVar

+import hivemind
 from hivemind.client import RemoteExpert
 from hivemind.dht.node import DHTNode, DHTID, DHTExpiration
-from hivemind.dht.routing import get_dht_time, DHTValue, DHTKey, Subkey
-from hivemind.utils import MPFuture, Endpoint, Hostname, get_logger, switch_to_uvloop, strip_port, ValueWithExpiration
+from hivemind.dht.routing import DHTValue, DHTKey, Subkey
+from hivemind.utils.networking import Hostname, Endpoint, strip_port
+from hivemind.utils import MPFuture, get_logger, switch_to_uvloop, ValueWithExpiration, await_cancelled, get_dht_time

 logger = get_logger(__name__)

-ExpertUID, ExpertPrefix, Coordinate, Score = str, str, int, float
-UidEndpoint = NamedTuple("UidEndpoint", [('uid', ExpertUID), ('endpoint', Endpoint)])
-UID_DELIMITER = '.'  # when declaring experts, DHT store all prefixes of that expert's uid, split over this prefix
-FLAT_EXPERT = -1  # grid prefix reserved for storing 1d expert uids. Used to speed up find_best_experts in 1d case.
-UID_PATTERN = re.compile('^(([^.])+)([.](?:[0]|([1-9]([0-9]*))))+$')  # e.g. ffn_expert.98.76.54 - prefix + some dims
-PREFIX_PATTERN = re.compile('^(([^.])+)([.](?:[0]|([1-9]([0-9]*))))*[.]$')  # e.g. expert. or ffn.45. (ends with ".")
-# formally, prefixes = {uid.split(UID_DELIMITER)[:length] for length in range(1, uid.count(UID_DELIMITER) + 2)}
-
-
-def is_valid_uid(maybe_uid: str) -> bool:
-    """ An uid must contain a string expert type, followed by one or more .-separated numeric indices """
-    return bool(UID_PATTERN.fullmatch(maybe_uid))
-
-
-def is_valid_prefix(maybe_prefix: str) -> bool:
-    """ An uid prefix must contain a string expert type, followed by optional numeric indices and a trailing period """
-    return bool(PREFIX_PATTERN.fullmatch(maybe_prefix))
-
-
-def split_uid(uid_or_prefix: Union[ExpertUID, ExpertPrefix]) -> Tuple[ExpertPrefix, Coordinate]:
-    """ Separate an expert UID or prefix into a new ExpertPrefix and integer for the last coordinate """
-    uid_or_prefix = uid_or_prefix.rstrip(UID_DELIMITER)
-    pivot = uid_or_prefix.rindex(UID_DELIMITER) + 1
-    return uid_or_prefix[:pivot], int(uid_or_prefix[pivot:])
+ReturnType = TypeVar('ReturnType')


 class DHT(mp.Process):
     """
-    High-level interface to hivemind.dht that is designed to allow RemoteMixtureOfExperts to select best experts.
-
-    * hivemind servers periodically announce their experts via DHT.declare_experts
-    * trainers find most suitable experts via DHT.find_best_experts
+    A high-level interface to a hivemind DHT that runs a single DHT node in a background process.
+    * hivemind servers periodically announce their experts via declare_experts (dht_handler.py)
+    * trainers find most suitable experts via RemoteMixtureOfExperts (beam_search.py)

     :param initial_peers: one or multiple endpoints pointing to active DHT peers. Similar format to listen_on.
     :param listen_on: an interface for incoming connections, e.g. "127.0.0.1:*", "0.0.0.0:1234" or "ipv6:[::]:*"
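Review note: the UID grammar helpers removed above (is_valid_uid, is_valid_prefix, split_uid) are deleted from dht.py, and per the new class docstring their callers now live in dht_handler.py and beam_search.py; where exactly the helpers land is not shown in this hunk. A minimal sketch of the behavior being moved, using only the patterns from the removed lines:

```python
import re

# copied from the removed module-level constants above
UID_PATTERN = re.compile('^(([^.])+)([.](?:[0]|([1-9]([0-9]*))))+$')
PREFIX_PATTERN = re.compile('^(([^.])+)([.](?:[0]|([1-9]([0-9]*))))*[.]$')

assert UID_PATTERN.fullmatch("ffn_expert.98.76.54")   # expert type + numeric grid coordinates
assert PREFIX_PATTERN.fullmatch("ffn_expert.98.")     # a prefix ends with the "." delimiter
assert not UID_PATTERN.fullmatch("ffn_expert.98.")    # a prefix is not a valid uid

# split_uid peeled off the last coordinate: split_uid("ffn_expert.98.76") == ("ffn_expert.98.", 76)
```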
@@ -70,60 +44,17 @@ class DHT(mp.Process):
     :param max_workers: declare_experts and get_experts will use up to this many parallel workers
         (but no more than one per key)
     :param expiration: experts declared from this node expire after this many seconds (default = 5 minutes)
-    :param receiver_threads: uses this many threads to await on input pipe. Default = 1 should be enough in most cases
-    :param negative_caching: if True, whenever DHT is unable to find an expert or prefix, it will cache the "no key"
-      result inside the DHT for :expiration: seconds. Caching only affects beam search and has three main effects:
-
-      1. Faster beam search under node failures: if there are inconsistencies in DHT keys, such as a prefix pointing to
-        a now-defunct expert, these inconsistencies will be overwritten by the first peer that stumbles upon them. As a
-        result, beam search will not have to wait for non-existent experts until the expiration of their DHT entries;
-      2. Delayed expert availability: Without negative cache, new experts are always immediately available for beam
-        search after they are published to the DHT. With negative cache, there are rare cases (e.g. when adding new
-        experts in place of recently defunct ones) when new experts will be initially invisible, but gradually become
-        visible to more peers as those peers refresh their cache. This process takes at most :expiration: seconds;
-      3. Faster beam search in very sparse grids: there is one edge case where negative cache will improve beam search
-        performance; If an expert grid is very sparse, there can be empty indices in the first grid dimension (i.e.
-        indices {i} such that _no_ experts that start with "{prefix}.{i}.*"). If so, the default beam search will
-        be very slow due to the way it forms initial beam. Beam search with negative cache enabled will run normally.
-        Though, this is a pathological case (e.g. only 90 experts in an oversized 100x100 grid) that should be avoided.
-
     :param kwargs: any other params will be forwarded to DHTNode upon creation
-
-    Each expert has an identifier in the form of {prefix}.{i}.{j}.{...}, e.g. "ffn_expert.98.76.54.32.10"
-    An expert identifier consists of:
-
-    * optional prefix that determines expert role, experiment name, etc.
-    * one or more integers that determine that expert's position in an N-dimensional grid
-
-    A hivemind.Server can ``DHT.declare_experts(expert_uids: List[str])`` to make its experts visible to everyone.
-    When declaring experts, DHT will store each expert's uid and all its prefixes until :expiration: (specified at init)
-    For instance, declaring "ffn_expert.98.76.54.32.10" will store the following keys in a DHT:
-    ``"ffn_expert.98", "ffn_expert.98.76", "ffn_expert.98.76.54", ..., "ffn_expert.98.76.54.32.10"``
-
-    In order to enable fast beam search, DHT maintains dictionaries of all active suffixes for every prefix
-    (e.g. "ffn_expert.98": {76: ffn_expert.98.76...., 123: ffn_expert.98.123..., 225: ffn_expert.98.225....}))
-
-    RemoteMixtureOfExperts can use these prefixes to find top-k most suitable experts with a left-to-right beam search.
-    For instance, consider RemoteMixtureOfExperts with prefix "ffn_expert" and grid size [100, 100, 100, 100, 100].
-    This MoE can query all experts with that prefix and arbitrary indices in 0...99 along each dimension.
-    However, not every expert in such 100^5 grid can be alive at a given moment of time (the grid size is redundant).
-    In order to find k best "alive" experts, MoE first ranks indices along the first dimension with its gating function.
-    It can then check which of those indices correspond to "alive" experts by querying keys such as "ffn_expert.98".
-
-    After selecting k best indices along first dimension, MoE moves to the second dimension.
-    It can find top-k index pairs (e.g. "expert.98.76") that use one of k best indices from the previous step.
-    This beam search explores one additional dimension per step and finds k best experts from across the DHT
-    in O(k * num_dimensions * dimension_size) time depending on the chosen grid dimensions.
     """

     def __init__(self, listen_on: Endpoint = "0.0.0.0:*", initial_peers: Sequence[Endpoint] = (), *, start: bool,
                  daemon: bool = True, max_workers: Optional[int] = None, parallel_rpc: Optional[int] = None,
-                 receiver_threads: int = 1, negative_caching: bool = True, expiration: float = 300, **kwargs):
+                 expiration: float = 300, **kwargs):
         super().__init__()
         assert not isinstance(initial_peers, str), "please specify a list/tuple of initial peers (even if there's one)"
         self.listen_on, self.initial_peers, self.kwargs = listen_on, initial_peers, kwargs
-        self.receiver_threads, self.max_workers, self.parallel_rpc = receiver_threads, max_workers, parallel_rpc
-        self.expiration, self.negative_caching = expiration, negative_caching
+        self.max_workers, self.parallel_rpc = max_workers, parallel_rpc
+        self.default_expiration = expiration
         self._port = mp.Value(ctypes.c_int32, 0)  # initialized after dht starts
         self._pipe, self.pipe = mp.Pipe(duplex=True)
         self.ready = mp.Event()
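With receiver_threads and negative_caching gone, the constructor surface is noticeably smaller. A usage sketch for reviewers (assumes the existing start/ready/port/shutdown semantics of this mp.Process subclass, which this hunk does not change):

```python
import hivemind

# start=True forks the background process and blocks until the node reports ready
dht = hivemind.DHT(listen_on="0.0.0.0:*", initial_peers=["192.168.1.1:1337"], start=True)
print(dht.port)   # filled in once the node has started (see self._port above)
dht.shutdown()    # terminates the background process
```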
@@ -134,7 +65,7 @@ class DHT(mp.Process):
     def run(self) -> None:
         """ Serve DHT forever. This function will not return until DHT node is shut down """
         loop = switch_to_uvloop()
-        pipe_awaiter = ThreadPoolExecutor(self.receiver_threads)
+        pipe_awaiter = ThreadPoolExecutor(max_workers=1)

         async def _run():
             node = await DHTNode.create(
@@ -201,7 +132,11 @@ class DHT(mp.Process):
               subkey: Optional[Subkey] = None, return_future: bool = False, **kwargs) -> Union[bool, MPFuture]:
         """
         Find num_replicas best nodes to store (key, value) and store it there until expiration time.
-        :note: store is a simplified interface to store_many, all kwargs are be forwarded there
+
+        :param key: msgpack-serializable key to be associated with value until expiration.
+        :param value: msgpack-serializable value to be stored under a given key until expiration.
+        :param expiration_time: absolute time when the entry should expire, based on hivemind.get_dht_time()
+        :param subkey: if specified, add a value under that subkey instead of overwriting key (see DHTNode.store_many)
         :param return_future: if False (default), return when finished. Otherwise return MPFuture and run in background.
         :returns: True if store succeeds, False if it fails (due to no response or newer value)
         """
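The expanded docstring pins down the store contract: expiration_time is an absolute timestamp, and subkey switches from overwrite to dictionary-accumulate semantics. A sketch of both forms (assumes a running dht as in the constructor sketch earlier):

```python
from hivemind.utils import get_dht_time

# plain key: returns True unless peers were unreachable or held a newer value
ok = dht.store('my_key', value=123, expiration_time=get_dht_time() + 60)

# with a subkey, values accumulate into a dictionary under 'peers_online' (see DHTNode.store_many)
dht.store('peers_online', subkey='alice', value=True, expiration_time=get_dht_time() + 60)
```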
@@ -221,6 +156,39 @@ class DHT(mp.Process):
             future.set_exception(e)
             raise

+    def run_coroutine(self, coro: Callable[[DHT, DHTNode], Awaitable[ReturnType]],
+                      return_future: bool = False) -> Union[ReturnType, MPFuture[ReturnType]]:
+        """
+        Execute an asynchronous function on a DHT participant and return results. This is meant as an interface
+        for running custom functions on DHT for special cases (e.g. declare experts, beam search)
+
+        :param coro: async function to be executed. Receives 2 arguments: this DHT daemon and a running DHTNode
+        :param return_future: if False (default), return when finished. Otherwise return MPFuture and run in background.
+        :returns: coroutine outputs or MPFuture for these outputs
+        :note: the coroutine will be executed inside the DHT process. As such, any changes to global variables or
+          DHT fields made by this coroutine will not be accessible from the host process.
+        :note: all time-consuming operations in coro should be asynchronous (e.g. asyncio.sleep instead of time.sleep)
+          or use asyncio.get_event_loop().run_in_executor(...) to prevent coroutine from blocking background DHT tasks
+        :note: when run_coroutine is called with return_future=True, the resulting MPFuture can be cancelled to interrupt the task.
+        """
+        future, _future = MPFuture.make_pair()
+        self.pipe.send(('_run_coroutine', [], dict(coro=coro, future=_future)))
+        return future if return_future else future.result()
+
+    async def _run_coroutine(self, node: DHTNode, coro: Callable[[DHT, DHTNode], Awaitable[ReturnType]],
+                             future: MPFuture[ReturnType]):
+        main_task = asyncio.create_task(coro(self, node))
+        cancel_task = asyncio.create_task(await_cancelled(future))
+        try:
+            await asyncio.wait({main_task, cancel_task}, return_when=asyncio.FIRST_COMPLETED)
+            if future.cancelled():
+                main_task.cancel()
+            else:
+                future.set_result(await main_task)
+        except BaseException as e:
+            if not future.done():
+                future.set_exception(e)
+
     def get_visible_address(self, num_peers: Optional[int] = None, peers: Sequence[Endpoint] = ()) -> Hostname:
         """
         Get this machine's visible address by requesting other peers or using pre-specified network addresses.
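Review note: run_coroutine is the escape hatch that replaces the per-method pipe plumbing deleted in the next hunk. A sketch of the calling convention (the _get_one helper is hypothetical; only node.get and the signatures above are taken from this diff):

```python
async def _get_one(dht: hivemind.DHT, node: DHTNode):
    # this body runs inside the DHT process; keep all slow operations asynchronous
    return await node.get('my_key')

value = dht.run_coroutine(_get_one)                       # block until the coroutine finishes
future = dht.run_coroutine(_get_one, return_future=True)  # or keep an MPFuture instead...
future.cancel()  # ...cancelling it interrupts the task inside the DHT process (via await_cancelled)
```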
@@ -274,289 +242,11 @@ class DHT(mp.Process):
             future.set_exception(ValueError(f"Can't get address: DHT node has no peers and no public endpoint."
                                             f" Please ensure the node is connected or specify peers=... manually."))

-    def declare_experts(self, uids: Sequence[ExpertUID], endpoint: Endpoint, wait: bool = True,
-                        timeout: Optional[float] = None) -> Dict[ExpertUID, bool]:
-        """
-        Make experts visible to all DHT peers; update timestamps if declared previously.
+    def declare_experts(self, uids, endpoint, wait: bool = True):
+        logger.warning("dht.declare_experts is scheduled for removal in 0.9.8, please use hivemind.declare_experts.")
+        return hivemind.declare_experts(self, uids, endpoint, wait=wait)

-        :param uids: a list of expert ids to update
-        :param endpoint: endpoint that serves these experts, usually your server endpoint (e.g. "201.111.222.333:1337")
-        :param wait: if True, awaits for declaration to finish, otherwise runs in background
-        :param timeout: waits for the procedure to finish for up to this long, None means wait indefinitely
-        :returns: if wait, returns store status for every key (True = store succeeded, False = store rejected)
-        """
-        assert not isinstance(uids, str), "Please send a list / tuple of expert uids."
-        for uid in uids:
-            assert is_valid_uid(uid), f"{uid} is not a valid expert uid. All uids must follow {UID_PATTERN.pattern}"
-        future, _future = MPFuture.make_pair() if wait else (None, None)
-        self.pipe.send(('_declare_experts', [], dict(uids=list(uids), endpoint=endpoint, future=_future)))
-        if wait:
-            return future.result(timeout)
-
-    async def _declare_experts(self, node: DHTNode, uids: List[ExpertUID], endpoint: Endpoint,
-                               future: Optional[MPFuture]) -> Dict[ExpertUID, bool]:
-        num_workers = len(uids) if self.max_workers is None else min(len(uids), self.max_workers)
-        expiration_time = get_dht_time() + self.expiration
-        data_to_store: Dict[Tuple[ExpertPrefix, Optional[Coordinate]], DHTValue] = {}
-        for uid in uids:
-            data_to_store[uid, None] = endpoint
-            prefix = uid if uid.count(UID_DELIMITER) > 1 else f'{uid}{UID_DELIMITER}{FLAT_EXPERT}'
-            for i in range(prefix.count(UID_DELIMITER) - 1):
-                prefix, last_coord = split_uid(prefix)
-                data_to_store[prefix, last_coord] = [uid, endpoint]
-
-        keys, maybe_subkeys, values = zip(*((key, subkey, value) for (key, subkey), value in data_to_store.items()))
-        store_ok = await node.store_many(keys, values, expiration_time, subkeys=maybe_subkeys, num_workers=num_workers)
-        if future is not None:
-            future.set_result(store_ok)
-        return store_ok
-
-    def get_experts(self, uids: List[ExpertUID], expiration_time: Optional[DHTExpiration] = None,
+    def get_experts(self, uids, expiration_time: Optional[DHTExpiration] = None,
                     return_future: bool = False) -> List[Optional[RemoteExpert]]:
- """
|
|
|
- :param uids: find experts with these ids from across the DHT
|
|
|
- :param expiration_time: if specified, return experts that expire no sooner than this (based on get_dht_time)
|
|
|
- :param return_future: if False (default), return when finished. Otherwise return MPFuture and run in background.
|
|
|
- :returns: a list of [RemoteExpert if found else None]
|
|
|
- """
|
|
|
- assert not isinstance(uids, str), "Please send a list / tuple of expert uids."
|
|
|
- future, _future = MPFuture.make_pair()
|
|
|
- self.pipe.send(('_get_experts', [], dict(uids=list(uids), expiration_time=expiration_time, future=_future)))
|
|
|
- return future if return_future else future.result()
|
|
|
-
|
|
|
- async def _get_experts(self, node: DHTNode, uids: List[ExpertUID], expiration_time: Optional[DHTExpiration],
|
|
|
- future: Optional[MPFuture] = None) -> List[Optional[RemoteExpert]]:
|
|
|
- if expiration_time is None:
|
|
|
- expiration_time = get_dht_time()
|
|
|
- num_workers = len(uids) if self.max_workers is None else min(len(uids), self.max_workers)
|
|
|
- found: Dict[ExpertUID, DHTValue] = await node.get_many(uids, expiration_time, num_workers=num_workers)
|
|
|
-
|
|
|
- experts: List[Optional[RemoteExpert]] = [None] * len(uids)
|
|
|
- for i, uid in enumerate(uids):
|
|
|
- if found[uid] is not None and isinstance(found[uid].value, Endpoint):
|
|
|
- experts[i] = RemoteExpert(uid, found[uid].value)
|
|
|
- if future:
|
|
|
- future.set_result(experts)
|
|
|
- return experts
|
|
|
-
|
|
|
- def get_initial_beam(self, prefix: ExpertPrefix, scores: Sequence[float], beam_size: int,
|
|
|
- num_workers: Optional[int] = None, return_future: bool = False
|
|
|
- ) -> List[Tuple[Score, ExpertPrefix, Dict[Coordinate, UidEndpoint]]]:
|
|
|
- """
|
|
|
- :param prefix: search for experts whose uids start with this prefix
|
|
|
- :param scores: prefer suffix coordinates that have highest scores
|
|
|
- :param beam_size: select this many active suffixes with highest scores
|
|
|
- :param num_workers: maintain up to this many concurrent DHT searches
|
|
|
- :param return_future: if False (default), return when finished. Otherwise return MPFuture and run in background.
|
|
|
- :returns: a list of up to beam_size tuples of (prefix score, prefix itself, dict{suffix: example expert})
|
|
|
- """
|
|
|
- assert is_valid_prefix(prefix), f"prefix '{prefix}' is invalid, it must follow {PREFIX_PATTERN.pattern}"
|
|
|
- future, _future = MPFuture.make_pair()
|
|
|
- self.pipe.send(('_get_initial_beam', [], dict(prefix=prefix, scores=tuple(scores), beam_size=beam_size,
|
|
|
- num_workers=num_workers, future=_future)))
|
|
|
- return future if return_future else future.result()
|
|
|
-
|
|
|
- async def _get_initial_beam(self, node, prefix: ExpertPrefix, beam_size: int, scores: Tuple[float, ...],
|
|
|
- num_workers: Optional[int] = None, future: Optional[MPFuture] = None
|
|
|
- ) -> List[Tuple[Score, ExpertPrefix, Dict[Coordinate, UidEndpoint]]]:
|
|
|
- num_workers = num_workers or self.max_workers or beam_size
|
|
|
- beam: List[Tuple[Score, ExpertPrefix, Dict[Coordinate, UidEndpoint]]] = []
|
|
|
- unattempted_indices: List[Coordinate] = sorted(range(len(scores)), key=scores.__getitem__) # from worst to best
|
|
|
- pending_tasks: Deque[Tuple[Coordinate, ExpertPrefix, asyncio.Task]] = deque()
|
|
|
-
|
|
|
- while len(beam) < beam_size and (unattempted_indices or pending_tasks):
|
|
|
- # dispatch additional tasks
|
|
|
- while unattempted_indices and len(pending_tasks) < num_workers:
|
|
|
- next_index = unattempted_indices.pop() # note: this is best unattempted index because of sort order
|
|
|
- next_best_prefix = f"{prefix}{next_index}{UID_DELIMITER}"
|
|
|
- pending_tasks.append((next_index, next_best_prefix, asyncio.create_task(node.get(next_best_prefix))))
|
|
|
-
|
|
|
- # await the next best prefix to be fetched
|
|
|
- pending_best_index, pending_best_prefix, pending_task = pending_tasks.popleft()
|
|
|
- try:
|
|
|
- maybe_prefix_data = await pending_task
|
|
|
- if maybe_prefix_data is not None and isinstance(maybe_prefix_data.value, dict):
|
|
|
- successors = {coord: UidEndpoint(*match.value) for coord, match in maybe_prefix_data.value.items()
|
|
|
- if isinstance(coord, Coordinate) and isinstance(getattr(match, 'value', None), list)
|
|
|
- and len(match.value) == 2}
|
|
|
- if successors:
|
|
|
- beam.append((scores[pending_best_index], pending_best_prefix, successors))
|
|
|
- elif maybe_prefix_data is None and self.negative_caching:
|
|
|
- logger.debug(f"DHT negative caching: storing a 'no prefix' entry for {pending_best_prefix}")
|
|
|
- asyncio.create_task(node.store(pending_best_prefix, subkey=-1, value=None,
|
|
|
- expiration_time=get_dht_time() + self.expiration))
|
|
|
-
|
|
|
- except asyncio.CancelledError:
|
|
|
- for _, pending_task in pending_tasks:
|
|
|
- pending_task.cancel()
|
|
|
- raise
|
|
|
- if future:
|
|
|
- future.set_result(beam)
|
|
|
- return beam
|
|
|
-
|
|
|
- def get_active_successors(self, prefixes: List[ExpertPrefix], grid_size: Optional[int] = None,
|
|
|
- num_workers: Optional[int] = None, return_future: bool = False
|
|
|
- ) -> Dict[ExpertPrefix, Dict[Coordinate, UidEndpoint]]:
|
|
|
- """
|
|
|
- :param prefixes: a list of prefix for which to find active successor uids
|
|
|
- :param grid_size: if specified, only return successors if ther are in range [0, grid_size)
|
|
|
- :param num_workers: how many parallel workers to use for DHTNode.get_many
|
|
|
- :param return_future: if False (default), find and return successors. Otherwise return MPFuture and fill later.
|
|
|
- :returns: for every expert, return a dict{active_next_coordinate: (matching_expert_uid, matching_endpoint)}
|
|
|
- :note: if a prefix is not found, get_active_successors will return an empty dictionary for that prefix
|
|
|
- """
|
|
|
- assert not isinstance(prefixes, str), "Please send a list / tuple of expert prefixes."
|
|
|
- for prefix in prefixes:
|
|
|
- assert is_valid_prefix(prefix), f"prefix '{prefix}' is invalid, it must follow {PREFIX_PATTERN.pattern}"
|
|
|
- future, _future = MPFuture.make_pair()
|
|
|
- self.pipe.send(('_get_active_successors', [], dict(
|
|
|
- prefixes=list(prefixes), grid_size=grid_size, num_workers=num_workers, future=_future)))
|
|
|
- return future if return_future else future.result()
|
|
|
-
|
|
|
- async def _get_active_successors(self, node: DHTNode, prefixes: List[ExpertPrefix], grid_size: Optional[int] = None,
|
|
|
- num_workers: Optional[int] = None, future: Optional[MPFuture] = None
|
|
|
- ) -> Dict[ExpertPrefix, Dict[Coordinate, UidEndpoint]]:
|
|
|
- grid_size = grid_size or float('inf')
|
|
|
- num_workers = num_workers or min(len(prefixes), self.max_workers or len(prefixes))
|
|
|
- dht_responses = await node.get_many(keys=prefixes, num_workers=num_workers)
|
|
|
- successors: Dict[ExpertPrefix, Dict[Coordinate, UidEndpoint]] = {}
|
|
|
- for prefix, found in dht_responses.items():
|
|
|
- if found and isinstance(found.value, dict):
|
|
|
- successors[prefix] = {coord: UidEndpoint(*match.value) for coord, match in found.value.items()
|
|
|
- if isinstance(coord, Coordinate) and 0 <= coord < grid_size
|
|
|
- and isinstance(getattr(match, 'value', None), list) and len(match.value) == 2}
|
|
|
- else:
|
|
|
- successors[prefix] = {}
|
|
|
- if found is None and self.negative_caching:
|
|
|
- logger.debug(f"DHT negative caching: storing a 'no prefix' entry for {prefix}")
|
|
|
- asyncio.create_task(node.store(prefix, subkey=-1, value=None,
|
|
|
- expiration_time=get_dht_time() + self.expiration))
|
|
|
- if future:
|
|
|
- future.set_result(successors)
|
|
|
- return successors
|
|
|
-
|
|
|
- def find_best_experts(self, prefix: ExpertPrefix, grid_scores: Sequence[Sequence[float]], beam_size: int,
|
|
|
- num_workers: Optional[int] = None, return_future: bool = False
|
|
|
- ) -> Union[List[RemoteExpert], MPFuture]:
|
|
|
- """
|
|
|
- Find and return :beam_size: active experts with highest scores, use both local cache and DHT
|
|
|
-
|
|
|
- :param prefix: common prefix for all expert uids in grid
|
|
|
- :param grid_scores: scores predicted for each dimension in the grid,
|
|
|
- :type grid_scores: model scores for each grid dimension, list of arrays of shape grid_size[i]
|
|
|
- :param beam_size: how many best experts should beam search return
|
|
|
- After time_budget is reached, beam search won't search for more experts and instead fall back on local cache
|
|
|
- Please note that any queries that fall outside the budget will still be performed in background and cached
|
|
|
- for subsequent iterations as long as DHTNode.cache_locally is True
|
|
|
- :param num_workers: use up to this many concurrent workers to search DHT
|
|
|
- :param return_future: if set to True, returns MPFuture that can be awaited to get the actual result
|
|
|
- :returns: a list that contains *up to* k_best RemoteExpert instances
|
|
|
- """
|
|
|
- assert len(grid_scores) > 0 and beam_size > 0
|
|
|
- assert is_valid_prefix(prefix), f"prefix '{prefix}' is invalid, it must follow {PREFIX_PATTERN.pattern}"
|
|
|
- future, _future = MPFuture.make_pair()
|
|
|
- self.pipe.send(('_find_best_experts', [], dict(prefix=prefix, grid_scores=list(map(tuple, grid_scores)),
|
|
|
- beam_size=beam_size, num_workers=num_workers, future=_future)))
|
|
|
- return future if return_future else future.result()
|
|
|
-
|
|
|
- async def _find_best_experts(
|
|
|
- self, node: DHTNode, prefix: str, grid_scores: List[Tuple[float]], beam_size: int,
|
|
|
- num_workers: Optional[int] = None, future: Optional[MPFuture] = None, **kwargs) -> List[RemoteExpert]:
|
|
|
- num_workers = num_workers or min(beam_size, self.max_workers or beam_size)
|
|
|
-
|
|
|
- # form initial beam from top-k active L1 prefixes, each row is (score, uid prefix, possible suffixes)
|
|
|
- beam: List[Tuple[Score, ExpertPrefix, Dict[Coordinate, UidEndpoint]]] = await self._get_initial_beam(
|
|
|
- node, prefix, beam_size, grid_scores[0], min(beam_size, num_workers))
|
|
|
-
|
|
|
- best_experts_heap: List[Tuple[Score, UidEndpoint]] = [] # max-heap of expert uids/endpoints ordered by scores
|
|
|
- unique_experts: Set[ExpertUID] = set()
|
|
|
-
|
|
|
- for dim_index in range(1, len(grid_scores) - 1):
|
|
|
- for score, uid_endpoint in self._iterate_matching_experts(beam, grid_scores):
|
|
|
- if uid_endpoint.uid not in unique_experts:
|
|
|
- push_and_maybe_pop = heapq.heappush if len(best_experts_heap) < beam_size else heapq.heappushpop
|
|
|
- push_and_maybe_pop(best_experts_heap, (score, uid_endpoint))
|
|
|
- unique_experts.add(uid_endpoint.uid)
|
|
|
-
|
|
|
- # form new beam using successors from the current beam
|
|
|
- dim_scores = grid_scores[dim_index]
|
|
|
- best_active_pairs: List[Tuple[Score, ExpertPrefix]] = heapq.nlargest(beam_size, (
|
|
|
- (prefix_score + dim_scores[next_coord], f"{prefix}{next_coord}{UID_DELIMITER}")
|
|
|
- for prefix_score, prefix, suffixes in beam for next_coord in suffixes.keys()
|
|
|
- if isinstance(next_coord, int) and 0 <= next_coord < len(dim_scores)))
|
|
|
- _, best_uid_prefixes = zip(*best_active_pairs)
|
|
|
-
|
|
|
- # search DHT for next step suffixes
|
|
|
- successors = await self._get_active_successors(node, best_uid_prefixes, num_workers=num_workers)
|
|
|
- beam = [(score, prefix, successors[prefix]) for score, prefix in best_active_pairs if successors[prefix]]
|
|
|
- if not beam:
|
|
|
- logger.warning(f"Beam search had to terminate prematurely because of empty beam (dim 0)")
|
|
|
- break
|
|
|
-
|
|
|
- # add best experts from the final beam
|
|
|
- for score, uid_endpoint in self._iterate_matching_experts(beam, grid_scores):
|
|
|
- if uid_endpoint.uid not in unique_experts:
|
|
|
- push_and_maybe_pop = heapq.heappush if len(best_experts_heap) < beam_size else heapq.heappushpop
|
|
|
- push_and_maybe_pop(best_experts_heap, (score, uid_endpoint))
|
|
|
- unique_experts.add(uid_endpoint.uid)
|
|
|
-
|
|
|
- best_experts = [RemoteExpert(*uid_endpoint) for score, uid_endpoint in sorted(best_experts_heap, reverse=True)]
|
|
|
- if future is not None:
|
|
|
- future.set_result(best_experts)
|
|
|
- return best_experts
|
|
|
-
|
|
|
- @staticmethod
|
|
|
- def _iterate_matching_experts(beam: List[Tuple[Score, ExpertPrefix, Dict[Coordinate, UidEndpoint]]],
|
|
|
- grid_scores: Sequence[Sequence[float]]) -> Iterator[Tuple[Score, UidEndpoint]]:
|
|
|
- """ iterate over all exemplar experts attached to current beam """
|
|
|
- for score, prefix, suffixes in beam:
|
|
|
- for next_coord, match in suffixes.items():
|
|
|
- if len(grid_scores) == 1 and next_coord == FLAT_EXPERT:
|
|
|
- yield score, match
|
|
|
- elif isinstance(match.uid, ExpertUID) and match.uid.count(UID_DELIMITER) == len(grid_scores):
|
|
|
- expert_coords = match.uid.split(UID_DELIMITER)[1:]
|
|
|
- if all(coord.isdigit() and 0 <= int(coord) < len(grid_scores[i])
|
|
|
- for i, coord in enumerate(expert_coords)):
|
|
|
- expert_score = sum(scores[coord] for scores, coord in zip(grid_scores, map(int, expert_coords)))
|
|
|
- yield expert_score, match
|
|
|
- else:
|
|
|
- logger.warning(f"Found incompatible expert coordinates: {expert_coords}")
|
|
|
- else:
|
|
|
- logger.warning(f"Found incompatible expert UID: {match.uid}")
|
|
|
-
|
|
|
- def batch_find_best_experts(
|
|
|
- self, prefix: str, batch_grid_scores: Sequence[Sequence[Sequence[float]]], beam_size: int, *,
|
|
|
- workers_per_sample: Optional[int] = None, return_future=False) -> Union[List[List[RemoteExpert]], MPFuture]:
|
|
|
- """
|
|
|
- Find and return :beam_size: active experts with highest scores, use both local cache and DHT
|
|
|
-
|
|
|
- :param prefix: common prefix for all expert uids in grid
|
|
|
- :param batch_grid_scores: scores predicted for each batch example and each dimension in the grid,
|
|
|
- :type batch_grid_scores: list of arrays of shape (batch_size, grid_size[i])
|
|
|
- :param beam_size: how many best experts should beam search return
|
|
|
- After time_budget is reached, beam search won't search for more experts and instead fall back on local cache
|
|
|
- Please note that any queries that fall outside the budget will still be performed in background and cached
|
|
|
- for subsequent iterations as long as DHTNode.cache_locally is True
|
|
|
- :param workers_per_sample: use up to this many concurrent workers for every sample in batch
|
|
|
- :param return_future: if set to True, returns MPFuture that can be awaited to get the actual result
|
|
|
- :returns: a list that contains *up to* k_best RemoteExpert instances
|
|
|
- """
|
|
|
- future, _future = MPFuture.make_pair()
|
|
|
- self.pipe.send(('_batch_find_best_experts', [], dict(prefix=prefix, batch_grid_scores=batch_grid_scores,
|
|
|
- beam_size=beam_size, workers_per_sample=workers_per_sample,
|
|
|
- future=_future)))
|
|
|
- return future if return_future else future.result()
|
|
|
-
|
|
|
- async def _batch_find_best_experts(
|
|
|
- self, node: DHTNode, prefix: str, batch_grid_scores: Sequence[Sequence[Tuple[float]]], beam_size: int,
|
|
|
- workers_per_sample: Optional[int] = None, future: Optional[MPFuture] = None) -> List[List[RemoteExpert]]:
|
|
|
-
|
|
|
- batch_grid_scores = [[tuple(grid_score[i]) for grid_score in batch_grid_scores]
|
|
|
- for i in range(len(batch_grid_scores[0]))]
|
|
|
- coros = [self._find_best_experts(node, prefix, grid_scores, beam_size, workers_per_sample)
|
|
|
- for grid_scores in batch_grid_scores]
|
|
|
-
|
|
|
- best_experts_batch = await asyncio.gather(*coros)
|
|
|
- if future is not None:
|
|
|
- future.set_result(best_experts_batch)
|
|
|
- return best_experts_batch
|
|
|
+ logger.warning("dht.get_experts is scheduled for removal in 0.9.8, please use hivemind.get_experts.")
|
|
|
+ return hivemind.get_experts(self, uids, expiration_time, return_future)
|
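End-of-diff note: roughly 280 lines of expert lookup and beam search leave this class, and the two shims above keep old call sites alive until 0.9.8. Migration is mechanical; the argument order below is taken directly from the shim bodies:

```python
import hivemind

dht = hivemind.DHT(start=True)  # constructed as before; only the expert-specific methods moved
# old: dht.declare_experts(uids, endpoint) and dht.get_experts(uids)
hivemind.declare_experts(dht, ['ffn_expert.98.76.54'], endpoint='201.111.222.333:1337')
experts = hivemind.get_experts(dht, ['ffn_expert.98.76.54'])  # [RemoteExpert or None per uid]
```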