Browse source code

RemoteMixtureOfExperts update part II (#80)

* strict msgpack types

* strict msgpack types

* wip: RemoteCallMany without utils.autograd (still need to update and test RemoteMixtureOfExperts!)

* wip: RemoteCallMany without utils.autograd (still need to update and test RemoteMixtureOfExperts!)

* wip: RemoteCallMany without utils.autograd (still need to update and test RemoteMixtureOfExperts!)

* implement and test RemoteMixtureOfExperts

* numpy -> torch

* order preference

* preferred test order

* split tests into test_remote_expert.py and test_moe.py

* reduce test_moe size

* move test_moe to the top

* magical test order

* Update tests/test_dht.py

Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>

* review {1337} => 1337

* .data.numpy => .detach.numpy

* review: full_like => full((1,))

* review: less hacky multiply

* unused import

* review: specify key order in docstring of first_k_active

Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>
justheuristic 5 years ago
parent
commit
6d2b8094c9

+ 17 - 14
hivemind/client/expert.py

@@ -1,5 +1,6 @@
 import pickle
-from typing import Tuple, Optional
+from functools import lru_cache
+from typing import Tuple, Optional, Any
 
 import grpc
 import grpc.experimental.aio
@@ -13,6 +14,19 @@ from hivemind.utils.grpc import serialize_torch_tensor, deserialize_torch_tensor
 DUMMY = torch.empty(0, requires_grad=True)  # dummy tensor that triggers autograd in RemoteExpert
 
 
+@lru_cache(maxsize=None)
+def _get_expert_stub(endpoint: Endpoint, aio: bool, *extra_options: Tuple[str, Any]):
+    """ Create a gRPC stub to access remote expert or use previously created stub from a process-wide cache """
+    channel_options = [
+        ('grpc.max_send_message_length', -1), ('grpc.max_receive_message_length', -1)
+    ] + list(extra_options)
+    if aio:
+        channel = grpc.experimental.aio.insecure_channel(endpoint, options=channel_options)
+    else:
+        channel = grpc.insecure_channel(endpoint, options=channel_options)
+    return runtime_grpc.ConnectionHandlerStub(channel)
+
+
 class RemoteExpert(nn.Module):
     """
     A simple module that runs forward/backward of an expert hosted on a remote machine.
@@ -28,22 +42,11 @@ class RemoteExpert(nn.Module):
     def __init__(self, uid, endpoint: Endpoint):
         super().__init__()
         self.uid, self.endpoint = uid, endpoint
-        self._channel, self._stub, self._info = None, None, None
+        self._info = None
 
     @property
     def stub(self):
-        if self._channel is None:
-            self._channel = grpc.insecure_channel(self.endpoint, options=[
-                ('grpc.max_send_message_length', -1),
-                ('grpc.max_receive_message_length', -1)
-            ])
-        if self._stub is None:
-            self._stub = runtime_grpc.ConnectionHandlerStub(self._channel)
-        return self._stub
-
-    def __del__(self):
-        if self._channel is not None:
-            self._channel.close()
+        return _get_expert_stub(self.endpoint, aio=False)
 
     def forward(self, *args, **kwargs):
         """ Call RemoteExpert for the specified inputs and return its output(s). Compatible with pytorch.autograd. """

+ 206 - 135
hivemind/client/moe.py

@@ -1,14 +1,20 @@
-from functools import partial
-from typing import Tuple, List, Optional
+from __future__ import annotations
+import time
+import asyncio
+from typing import Tuple, List, Optional, Awaitable, Set, Dict
 
-import numpy as np
 import torch
 import torch.nn as nn
 from torch.autograd.function import once_differentiable
+import grpc.experimental.aio
 
-from hivemind.client.expert import RemoteExpert, _RemoteModuleCall, DUMMY
-from hivemind.utils import nested_map, run_and_await_k, nested_pack, nested_flatten, run_in_background, \
-    run_isolated_forward, EmulatedAutogradContext, run_isolated_backward, map_with_parallel_backward
+import hivemind
+from hivemind.client.expert import RemoteExpert, DUMMY, _get_expert_stub
+from hivemind.utils import nested_map, nested_pack, nested_flatten, runtime_grpc, runtime_pb2, \
+    serialize_torch_tensor, deserialize_torch_tensor
+from hivemind.utils.logging import get_logger
+
+logger = get_logger(__name__)
 
 
 class RemoteMixtureOfExperts(nn.Module):
@@ -25,30 +31,31 @@ class RemoteMixtureOfExperts(nn.Module):
     :param uid_prefix: common prefix for all expert uids
      expert uid follows the pattern {uid_prefix}.{0...grid_size[0]}.{0...grid_size[1]}...{0...grid_size[-1]}
     :param dht: DHT where the experts reside
-    :param num_workers: number of threads for parallel dht operation
     :param k_best: queries this many experts with highest scores
     :param k_min: makes sure at least this many experts returned output
     :param timeout_after_k_min: waits for this many seconds after k_min experts returned results.
      Any expert that didn't manage to return output after that delay is considered unavailable
-    :param expert_padding: internal value used to denote "absent expert". Should not coincide with any expert uid.
     :param allow_broadcasting: if RemoteMixtureOfExperts is fed an input with more than 2 dimensions,
      allow_broadcasting=True will flatten first d-1 input dimensions, apply RemoteMixtureOfExperts and un-flatten again
      allow_broadcasting=False will raise an error
     """
 
-    def __init__(self, *, in_features, grid_size: Tuple[int], dht, k_best, k_min=1,
-                 forward_timeout=None, timeout_after_k_min=1.0, backward_k_min=1, backward_timeout=None,
-                 uid_prefix='', expert_padding=None, allow_broadcasting=True):
+    def __init__(self, *, in_features, grid_size: Tuple[int, ...], dht: hivemind.DHT, k_best: int, k_min: int = 1,
+                 forward_timeout: Optional[float] = None, timeout_after_k_min: Optional[float] = None,
+                 backward_k_min: int = 1, backward_timeout: Optional[float] = None, uid_prefix='',
+                 allow_broadcasting=True, loop: asyncio.BaseEventLoop = None):
         super().__init__()
-        self.dht, self.grid_size = dht, grid_size
-        self.uid_prefix, self.expert_padding = uid_prefix, expert_padding
+        self.dht, self.grid_size, self.uid_prefix = dht, grid_size, uid_prefix
+        self.loop = loop or asyncio.new_event_loop()
+        assert not self.loop.is_running(), "Event loop is already running. If in jupyter, please apply nest_asyncio " \
+            "(pip install nest_asyncio , https://pypi.org/project/nest-asyncio ) and send loop=asyncio.new_event_loop()"
         self.k_best, self.k_min, self.backward_k_min = k_best, k_min, backward_k_min
         self.forward_timeout, self.backward_timeout = forward_timeout, backward_timeout
         self.timeout_after_k_min = timeout_after_k_min
         self.allow_broadcasting = allow_broadcasting
 
         self.proj = nn.Linear(in_features, sum(grid_size))  # jointly predict logits for all grid dimensions
-        self._outputs_schema = None
+        self._outputs_schema = None  # expert.info['outputs_schema'] from one of the experts in the grid
 
     def forward(self, input: torch.Tensor, *args: torch.Tensor, **kwargs: torch.Tensor):
         """
@@ -69,25 +76,31 @@ class RemoteMixtureOfExperts(nn.Module):
 
         # 1. compute scores and find most appropriate experts with beam search
         grid_scores = self.proj(input).split_with_sizes(self.grid_size, dim=-1)
-        chosen_experts = self.beam_search(grid_scores, self.k_best)
-        # ^-- List[batch_size] of List[RemoteExpert] chosen for every input in batch
 
-        expert_logits = self.compute_expert_scores(grid_scores, chosen_experts)
+        async def _search():
+            coroutines = [asyncio.create_task(self.beam_search(
+                [dim_scores[i] for dim_scores in grid_scores], self.k_best))
+                for i in range(len(input))]
+            return list(await asyncio.gather(*coroutines))
 
-        expert_inputs = ((input, *args), kwargs)
-        input_schema = nested_map(lambda x: None, expert_inputs)
-        flat_inputs_per_expert = tuple(zip(*[tensor.split(1, dim=0) for tensor in nested_flatten(expert_inputs)]))
+        chosen_experts: List[List[RemoteExpert]] = self.loop.run_until_complete(_search())
+        # ^-- List[batch_size] of List[RemoteExpert] chosen for every input in batch
 
-        batch_jobs_args = tuple(
-            (expert_logits[i, :len(chosen_experts[i])], chosen_experts[i], self.k_min, self.timeout_after_k_min,
-             self.backward_k_min, self.forward_timeout, self.backward_timeout, input_schema, *flat_inputs_per_expert[i])
-            for i in range(len(input))
-        )
+        expert_mask, *expert_outputs = _RemoteCallMany.apply(
+            DUMMY, chosen_experts, self.k_min, self.backward_k_min, self.timeout_after_k_min,
+            self.forward_timeout, self.backward_timeout, self.loop, *nested_flatten(((input, *args), kwargs)))
+        # ^-- multiple tensors of shape [batch_size, max_experts, ...output_shape]
 
-        averaged_outputs_flat = map(torch.cat, zip(*map_with_parallel_backward(_RemoteMoECall, *batch_jobs_args)))
+        expert_logits = self.compute_expert_scores(grid_scores, chosen_experts)
+        masked_logits = torch.full((1,), float('-inf'), device=expert_logits.device, dtype=expert_logits.dtype)
+        expert_logits = torch.where(expert_mask, expert_logits, masked_logits)
+        expert_weights = torch.softmax(expert_logits, dim=1)
+        averaged_outputs_flat = [
+            (expert_weights[..., None] * tensor.flatten(start_dim=2)).view(tensor.shape).sum(dim=1)
+            for tensor in expert_outputs]  # ^-- multiply by softmax weights along first 2 axes
         return nested_pack(averaged_outputs_flat, self.outputs_schema)
 
-    def beam_search(self, grid_scores: List[torch.Tensor], k_best: int, **kwargs) -> List[List[RemoteExpert]]:
+    async def beam_search(self, grid_scores: List[torch.Tensor], k_best: int, **kwargs) -> List[RemoteExpert]:
         """
         Find and return k best experts in the grid using (exact) beam search of the product space
 
@@ -99,51 +112,39 @@ class RemoteMixtureOfExperts(nn.Module):
          RemoteExpert instances for *up to* k_best experts
         """
         assert len(grid_scores) == len(self.grid_size)
-        assert all(len(dim_scores.shape) == 2 for dim_scores in grid_scores)
-        batch_size = len(grid_scores[0])
-        beam = np.array([[self.uid_prefix]] * batch_size, dtype=object)  # [batch_size, up_to_beam_size]
-        scores = np.zeros([batch_size, 1], dtype=np.float64)
+        assert all(dim_scores.shape == (self.grid_size[dim_index],) for dim_index, dim_scores in enumerate(grid_scores))
+        grid_scores = [dim_scores.cpu().detach() for dim_scores in grid_scores]
 
-        delimiters = np.array(self.dht.UID_DELIMITER)[None, None, None]  # pre-compute numpy array for fast concat
+        beam_experts: List[RemoteExpert] = []
+        beam: List[str] = [self.uid_prefix]
+        beam_scores = torch.zeros(1)
 
         for dim_index, dim_scores in enumerate(grid_scores):
-            dim_scores = dim_scores.detach().cpu().numpy()
-            assert dim_scores.shape[-1] == self.grid_size[dim_index]
-
-            # create all possible successsors from current beam
-            dim_indices = np.arange(dim_scores.shape[1]).astype(str)
-            new_candidates = beam[:, :, None] + delimiters + dim_indices[None, None, :]
-            new_candidates = new_candidates.reshape([batch_size, -1])
+            # create all possible successors from current beam and sort them by total score
+            expanded_scores = beam_scores[:, None] + dim_scores[None, :]
+            sorted_indices = [(flat_i // len(dim_scores), flat_i % len(dim_scores))
+                              for flat_i in (-expanded_scores).flatten().argsort().numpy()]
 
-            new_scores = scores[:, :, None] + dim_scores[:, None, :]
-            new_scores = new_scores.reshape([batch_size, -1])
+            sorted_candidates = [f"{beam[row]}{self.dht.UID_DELIMITER}{col}" for row, col in sorted_indices]
+            candidate_to_indices = dict(zip(sorted_candidates, sorted_indices))
 
             # select k best candidates according to scores but only those that are still active
-            new_order = np.argsort(- new_scores, axis=-1)
-            top_alive_lookups = [
-                run_in_background(self.dht.first_k_active, cands[order], k_best, **kwargs)
-                for cands, order in zip(new_candidates, new_order)]
-
-            batch_cand_to_score = [
-                dict(zip(cands, cand_scores)) for cands, cand_scores in zip(new_candidates, new_scores)]
-
-            top_alive_prefixes = [result.result() for result in top_alive_lookups]
-            top_alive_scores = [list(map(cand_to_score.get, top_cands))
-                                for cand_to_score, top_cands in zip(batch_cand_to_score, top_alive_prefixes)]
-
-            # pad up to beam size
-            beam = np.array([row + [self.expert_padding] * (k_best - len(row))
-                             for row in top_alive_prefixes], dtype='object')
-            scores = np.array([row + [-float('inf')] * (k_best - len(row))
-                               for row in top_alive_scores], dtype='float32')
-
-        unique_experts = self.dht.get_experts(list(set(
-            uid for row in beam for uid in row if uid != self.expert_padding)))
+            best_alive_prefixes: Dict[str, RemoteExpert] = await self.dht.first_k_active(
+                uid_prefixes=sorted_candidates, k=k_best, return_future=True, **kwargs)
+            if not best_alive_prefixes:
+                logger.warning(f"Grid is empty: found neither of {sorted_candidates}")
+                break
+            beam = list(best_alive_prefixes.keys())
+            beam_scores = expanded_scores[tuple(zip(*map(candidate_to_indices.get, beam)))]
+            beam_experts = list(best_alive_prefixes.values())
+
         if self._outputs_schema is None:
-            self._outputs_schema = next(iter(unique_experts)).info['outputs_schema']
-        unique_experts_by_uid = {expert.uid: expert for expert in unique_experts if expert != self.expert_padding}
+            try:
+                self._outputs_schema = beam_experts[0].info['outputs_schema']
+            except grpc.RpcError as e:
+                logger.warning(f"Failed to get RemoteMixtureOfExperts.output_shape: {e}")
 
-        return [[unique_experts_by_uid[uid] for uid in row if uid in unique_experts_by_uid] for row in beam]
+        return beam_experts
 
     def compute_expert_scores(
             self, grid_scores: List[torch.Tensor], batch_experts: List[List[RemoteExpert]]) -> torch.Tensor:
@@ -164,11 +165,11 @@ class RemoteMixtureOfExperts(nn.Module):
         flat_local_indices = expert_index_in_batch - expert_strides[flat_batch_indices]
         flat_experts = [expert for row in batch_experts for expert in row]
 
-        grid_indices = np.zeros([len(flat_experts), len(grid_scores)], dtype=np.int64)
+        grid_indices = torch.zeros([len(flat_experts), len(grid_scores)], dtype=torch.int64)
         for i, expert in enumerate(flat_experts):
             expert_indices = expert.uid[len(self.uid_prefix) + len(self.dht.UID_DELIMITER):]
             expert_indices = list(map(int, expert_indices.split(self.dht.UID_DELIMITER)))
-            grid_indices[i] = expert_indices
+            grid_indices[i] = torch.as_tensor(expert_indices, dtype=grid_indices.dtype)
 
         scores_per_dim = [
             dim_scores[flat_batch_indices, dim_indices] if len(flat_batch_indices) else torch.zeros(0)
@@ -183,86 +184,156 @@ class RemoteMixtureOfExperts(nn.Module):
     def outputs_schema(self):
         if self._outputs_schema is None:
             # grab some expert to set ensemble output shape
-            dummy_scores = self.proj(torch.randn(1, self.proj.in_features)).split_with_sizes(self.grid_size, dim=-1)
-            self._outputs_schema = self.beam_search(dummy_scores, k_best=1)[0][0].info['outputs_schema']
+            dummy_scores = self.proj(torch.randn(self.proj.in_features)).cpu().split_with_sizes(self.grid_size, dim=-1)
+            dummy_experts = self.loop.run_until_complete(self.beam_search(dummy_scores, k_best=1))
+            self._outputs_schema = dummy_experts[0].info['outputs_schema']
         return self._outputs_schema
 
 
-class _RemoteMoECall(torch.autograd.Function):
+class _RemoteCallMany(torch.autograd.Function):
     """
-    Internal autograd-friendly function that calls multiple experts on the same input and averages their outputs.
-    This function that can recover from individual failures during forward and/or backward passes.
-    For user-friendly version of this function, use RemoteMixtureOfExperts module.
+    Internal autograd-friendly function that calls multiple experts on a batch of inputs and awaits responses
+    This function can recover from individual failures during forward and/or backward pass as long as at least
+    one expert succeeds for each input. For user-friendly version of this function, use RemoteMixtureOfExperts module.
+
+    Note: experts that failed during forward will be assigned zero outputs and marked as mask[i, j] = 0,
+          experts that failed during backward will be treated as constants (i.e. gradients through them are zeros)
     """
 
     @classmethod
-    def forward(cls, ctx, expert_logits: torch.Tensor, experts: List[RemoteExpert],
-                k_min: int, timeout_after_k_min: float, backward_k_min: int, timeout_total: Optional[float],
-                backward_timeout: Optional[float], input_schema, *flat_inputs: torch.Tensor) -> Tuple[torch.Tensor]:
-        expert_args, expert_kwargs = nested_pack(flat_inputs, structure=input_schema)
-        assert expert_logits.ndim == 1 and len(expert_logits) == len(experts)
-
-        # 1. call experts and await results
-        jobs = [partial(cls._run_expert_forward, expert, *expert_args, **expert_kwargs) for expert in experts]
-        results = run_and_await_k(jobs, k=k_min, timeout_after_k=timeout_after_k_min, timeout_total=timeout_total)
-
-        alive_contexts, alive_outputs, alive_ix = zip(*[(result[0], result[1], ix) for ix, result in enumerate(results)
-                                                        if not isinstance(result, BaseException)])
-        #     ^               ^            ^-- a list of indices of experts that returned outputs in time
-        #      \               \-- list of outputs of every expert that didn't die on us
-        #       \-- a list of autograd contexts, used for parallel backward
-
-        # 2. compute softmax weights for alive experts and average outputs
-        alive_ix = torch.as_tensor(alive_ix, device=expert_logits.device)
-        alive_expert_probs = torch.softmax(expert_logits[alive_ix], dim=0)
-
-        stacked_alive_outputs = tuple(map(torch.stack, zip(*alive_outputs)))
-
-        flat_average_outputs = tuple((alive_expert_probs @ stacked_out.flatten(1)).view(*stacked_out.shape[1:])
-                                     for stacked_out in stacked_alive_outputs)
-
-        # 3. save individual outputs for backward pass
-        ctx.save_for_backward(expert_logits, alive_ix, alive_expert_probs, *stacked_alive_outputs)
-        ctx._saved_non_tensors = alive_contexts, backward_k_min, backward_timeout
-        return tuple(map(torch.Tensor.detach, flat_average_outputs))
+    def forward(cls, ctx, dummy, experts_per_sample: List[List[RemoteExpert]], k_min: int, backward_k_min: int,
+                timeout_after_k_min: float, forward_timeout: Optional[float], backward_timeout: Optional[float],
+                loop: asyncio.base_events.BaseEventLoop, *flat_inputs: torch.Tensor) -> Tuple[torch.Tensor]:
+        assert not torch.is_grad_enabled()
+        num_samples, max_experts = len(experts_per_sample), max(map(len, experts_per_sample))
+        flat_inputs_per_sample: List[Tuple[torch.Tensor, ...]] = list(zip(*(x.split(1, dim=0) for x in flat_inputs)))
+        assert len(experts_per_sample) == len(flat_inputs_per_sample) == num_samples
+
+        async def _forward():
+            # dispatch tasks to all remote experts, await responses
+            pending_tasks = {
+                asyncio.create_task(cls._forward_one_expert((i, j), expert, flat_inputs_per_sample[i]))
+                for i in range(num_samples) for j, expert in enumerate(experts_per_sample[i])
+            }
+            alive_grid_indices, alive_flat_outputs = await cls._wait_for_responses(
+                pending_tasks, num_samples, k_min, forward_timeout, timeout_after_k_min)
+
+            # assemble responses
+            alive_ii, alive_jj = map(torch.as_tensor, zip(*alive_grid_indices))
+            mask = torch.zeros([num_samples, max_experts], dtype=torch.bool, device=flat_inputs[0].device)
+            mask[alive_ii, alive_jj] = True
+
+            alive_flat_outputs_stacked = list(map(torch.cat, zip(*alive_flat_outputs)))
+            # list of torch tensors, where i-th tensor is of shape [num_responded, *expert_outputs[i].shape]
+
+            outputs = []
+            for response_stacked in alive_flat_outputs_stacked:
+                output = torch.zeros(
+                    [num_samples, max_experts, *response_stacked.shape[1:]], device=response_stacked.device,
+                    dtype=response_stacked.dtype, requires_grad=response_stacked.requires_grad)
+                output[alive_ii, alive_jj] = response_stacked
+                outputs.append(output)
+
+            # save individual outputs for backward pass
+            ctx.save_for_backward(alive_ii, alive_jj, *flat_inputs)
+            ctx._saved_non_tensors = loop, backward_k_min, backward_timeout, timeout_after_k_min, experts_per_sample
+            return (mask,) + tuple(outputs)
+
+        return loop.run_until_complete(_forward())
 
     @classmethod
     @once_differentiable
-    def backward(cls, ctx, *grad_outputs_flat: torch.Tensor) -> Tuple[Optional[torch.Tensor], ...]:
-        """ Like normal backward, but we ignore any experts that failed during backward pass """
-        expert_logits, alive_ix, alive_expert_probas, *stacked_alive_outputs = ctx.saved_tensors
-        alive_contexts, backward_k_min, backward_timeout = ctx._saved_non_tensors
-
-        jobs = [partial(cls._run_expert_backward, ctx, prob, *grad_outputs_flat)
-                for ctx, prob in zip(alive_contexts, alive_expert_probas.split(1))]
-        results = run_and_await_k(jobs, k=backward_k_min, timeout_after_k=backward_timeout, timeout_total=None)
-        backward_survivors_in_alive_ix, survived_grad_inputs = zip(*((i, grads) for i, grads in enumerate(results)))
-        backward_survivors_in_alive_ix = torch.as_tensor(backward_survivors_in_alive_ix, device=expert_logits.device)
-        backward_survivors_ix = alive_ix[backward_survivors_in_alive_ix]
-        survived_probas = torch.softmax(expert_logits[backward_survivors_ix], dim=0)
-        weight_ratios = survived_probas / alive_expert_probas[backward_survivors_in_alive_ix]
-        flat_grad_inputs = tuple((weight_ratios @ stacked_grad_inp.flatten(1)).view(stacked_grad_inp.shape[1:])
-                                 for stacked_grad_inp in map(torch.stack, zip(*survived_grad_inputs)))
-
-        # compute grad w.r.t. logits
-        grad_wrt_probs = sum(tuple(
-            torch.sum(grad_out[None, ...] * stacked_avive_out[backward_survivors_in_alive_ix],
-                      dim=tuple(range(1, stacked_avive_out.ndim)))
-            for grad_out, stacked_avive_out in zip(grad_outputs_flat, stacked_alive_outputs)
-        ))
-        softmax_jacobian = torch.diagflat(survived_probas) - torch.ger(survived_probas, survived_probas)
-        grad_wrt_survived_logits = grad_wrt_probs @ softmax_jacobian
-        grad_wrt_logits = torch.zeros_like(expert_logits).scatter(0, backward_survivors_ix, grad_wrt_survived_logits)
-
-        return (grad_wrt_logits, None, None, None, None, None, None, None, *flat_grad_inputs)
+    def backward(cls, ctx, *raw_grads: torch.Tensor) -> Tuple[Optional[torch.Tensor], ...]:
+        assert not torch.is_grad_enabled()
+        loop, backward_k_min, backward_timeout, timeout_after_k_min, expert_per_sample = ctx._saved_non_tensors
+        alive_ii, alive_jj, *flat_inputs = ctx.saved_tensors
+        dummy_grad_mask, *flat_grad_outputs = raw_grads
+        num_samples, max_experts = dummy_grad_mask.shape
+
+        inputs_per_expert = zip(*(tensor[alive_ii].split(1, dim=0) for tensor in flat_inputs))
+        grad_outputs_per_expert = zip(*(tensor[alive_ii, alive_jj].split(1, dim=0) for tensor in flat_grad_outputs))
+
+        async def _backward():
+            # dispatch tasks to all remote experts, await responses
+            pending_tasks = set()
+            for i, j, inputs_ij, grad_outputs_ij in zip(alive_ii.cpu().numpy(), alive_jj.cpu().numpy(),
+                                                        inputs_per_expert, grad_outputs_per_expert):
+                pending_tasks.add(asyncio.create_task(
+                    cls._backward_one_expert((i, j), expert_per_sample[i.item()][j.item()], inputs_ij, grad_outputs_ij)
+                ))
+
+            backward_survivor_indices, survivor_grad_inputs = await cls._wait_for_responses(
+                pending_tasks, num_samples, backward_k_min, backward_timeout, timeout_after_k_min)
+
+            # assemble responses
+            backward_survivor_ii, backward_survivor_jj = map(torch.as_tensor, zip(*backward_survivor_indices))
+            survivor_grad_inputs_stacked = list(map(torch.cat, zip(*survivor_grad_inputs)))
+            # list of torch tensors, where i-th tensor is of shape [num_backward_survivors, *flat_inputs[i].shape]
+
+            grad_inputs = []
+            for i, survivor_grad_stacked in enumerate(survivor_grad_inputs_stacked):
+                grad_input_per_expert = torch.zeros(  # gradient tensor with individual contributions from each expert
+                    (num_samples, max_experts, *flat_inputs[i].shape[1:]),
+                    device=survivor_grad_stacked.device, dtype=survivor_grad_stacked.dtype)
+                grad_input_per_expert[backward_survivor_ii, backward_survivor_jj] = survivor_grad_stacked
+
+                grad_inputs.append(grad_input_per_expert.sum(dim=1))  # add up gradients from each expert
+
+            return (DUMMY, None, None, None, None, None, None, None, *grad_inputs)
+        return loop.run_until_complete(_backward())
+
+    @staticmethod
+    async def _forward_one_expert(grid_indices: Tuple[int, ...], expert: RemoteExpert, inputs: Tuple[torch.Tensor]):
+        stub: runtime_grpc.ConnectionHandlerStub = _get_expert_stub(expert.endpoint, aio=True)
+        try:
+            outputs = await stub.forward(runtime_pb2.ExpertRequest(
+                uid=expert.uid, tensors=[serialize_torch_tensor(tensor) for tensor in inputs]))
+            return grid_indices, tuple(deserialize_torch_tensor(tensor) for tensor in outputs.tensors)
+        except grpc.experimental.aio.AioRpcError as error:
+            logger.warning(f"RemoteExpert {expert} failed forward: {error.code()} (inputs: {inputs})")
 
     @staticmethod
-    def _run_expert_forward(expert: RemoteExpert, *args: torch.Tensor, **kwargs: torch.Tensor):
-        """ Call remote expert and return flattened outputs. Compatible with concurrent autograd. """
-        return run_isolated_forward(_RemoteModuleCall, DUMMY, expert.uid, expert.stub, *nested_flatten((args, kwargs)))
+    async def _backward_one_expert(grid_indices: Tuple[int, ...], expert: RemoteExpert,
+                                   inputs: Tuple[torch.Tensor], grad_outputs: Tuple[torch.Tensor]):
+        stub: runtime_grpc.ConnectionHandlerStub = _get_expert_stub(expert.endpoint, aio=True)
+        payload = tuple(nested_flatten((inputs, grad_outputs)))
+        try:
+            grad_inputs = await stub.backward(runtime_pb2.ExpertRequest(
+                uid=expert.uid, tensors=[serialize_torch_tensor(tensor) for tensor in payload]))
+            return grid_indices, tuple(deserialize_torch_tensor(tensor) for tensor in grad_inputs.tensors)
+        except grpc.experimental.aio.AioRpcError as error:
+            logger.warning(f"RemoteExpert {expert} failed backward: {error.code()} ({inputs}, {grad_outputs})")
 
     @staticmethod
-    def _run_expert_backward(ctx: EmulatedAutogradContext, weight: torch.Tensor, *grad_outputs: torch.Tensor):
-        backward_result = run_isolated_backward(_RemoteModuleCall, ctx, *(grad * weight for grad in grad_outputs))
-        grad_dummy, no_grad_uid, no_grad_stub, *grad_inputs = backward_result
-        return grad_inputs
+    async def _wait_for_responses(
+            pending_tasks: Set[Awaitable[Tuple[Tuple[int, int], Tuple[torch.Tensor, ...]]]],
+            num_samples: int, k_min: int, timeout_total: Optional[float], timeout_after_k_min: Optional[float]
+            ) -> Tuple[List[Tuple[int, int]], List[Tuple[torch.Tensor, ...]]]:
+        """ await up to k_min results and any result submitted within timeout_after_k_min, cancel stragglers """
+        timeout_total = float('inf') if timeout_total is None else timeout_total
+        timeout_after_k_min = float('inf') if timeout_after_k_min is None else timeout_after_k_min
+        num_successful_tasks = [0 for _ in range(num_samples)]
+        pending_samples = num_samples  # samples for which we have less than k_min results
+        finished_indices, finished_outputs = [], []
+        t_finish = time.perf_counter() + timeout_total
+
+        while pending_tasks and time.perf_counter() <= t_finish:
+            finished_tasks, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED,
+                                                               timeout=t_finish - time.perf_counter())
+            for task in finished_tasks:
+                if not task.result():
+                    continue
+                task_indices, task_flat_outputs = await task
+                finished_indices.append(task_indices)
+                finished_outputs.append(task_flat_outputs)
+
+                sample_index = task_indices[0]
+                num_successful_tasks[sample_index] += 1
+                if num_successful_tasks[sample_index] == k_min:
+                    pending_samples -= 1
+                    if pending_samples <= 0:  # every sample has at least k_min results; await stragglers for at most timeout_after_k_min
+                        t_finish = min(t_finish, time.perf_counter() + timeout_after_k_min)
+
+        for task in pending_tasks:
+            task.cancel()
+        return finished_indices, finished_outputs
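
Two notes on the rewritten moe.py, each with a small self-contained sketch; shapes, grid sizes and uids below are made up for illustration and are not hivemind defaults.

The new beam_search expands the beam one grid dimension at a time, scoring each candidate uid by the sum of its per-dimension scores; in the real code only the candidates that dht.first_k_active reports as alive survive to the next dimension. The sketch keeps just the scoring/expansion logic and pretends every candidate is active:

    import torch

    grid_scores = [torch.tensor([0.1, 0.9]), torch.tensor([0.3, 0.2, 0.5])]  # a 2 x 3 expert grid
    k_best = 2
    beam, beam_scores = [''], torch.zeros(1)
    for dim_scores in grid_scores:
        expanded = beam_scores[:, None] + dim_scores[None, :]      # [len(beam), grid_size[dim]]
        order = (-expanded).flatten().argsort().tolist()           # candidate indices, best first
        candidates = [(f"{beam[i // len(dim_scores)]}.{i % len(dim_scores)}",
                       expanded.flatten()[i].item()) for i in order]
        survivors = candidates[:k_best]                            # here: pretend every uid is active
        beam = [uid for uid, _ in survivors]
        beam_scores = torch.tensor([score for _, score in survivors])
    print(beam)  # ['.1.2', '.1.0'] -- the two best-scoring expert uids

The forward pass then mixes expert outputs with softmax weights computed over masked logits, so experts that failed (mask[i, j] == False) contribute exactly zero to the mixture:

    import torch

    batch_size, max_experts, hidden = 2, 3, 4
    expert_logits = torch.randn(batch_size, max_experts)
    expert_mask = torch.tensor([[True, True, False],
                                [True, False, False]])              # which experts responded
    expert_outputs = torch.randn(batch_size, max_experts, hidden)   # stand-in for _RemoteCallMany outputs

    masked_logits = torch.where(expert_mask, expert_logits, torch.full((1,), float('-inf')))
    expert_weights = torch.softmax(masked_logits, dim=1)            # failed experts get weight 0
    mixture = (expert_weights[..., None] * expert_outputs).sum(dim=1)  # [batch_size, hidden]
    assert torch.allclose(expert_weights[~expert_mask], torch.zeros(1))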

+ 25 - 19
hivemind/dht/__init__.py

@@ -16,9 +16,9 @@ import asyncio
 import ctypes
 import multiprocessing as mp
 import warnings
-from collections import deque
+from collections import deque, OrderedDict
 from concurrent.futures import ThreadPoolExecutor
-from typing import List, Optional, Sequence
+from typing import List, Tuple, Optional, Sequence, OrderedDict as TOrderedDict, Union, Awaitable
 
 import uvloop
 
@@ -126,17 +126,17 @@ class DHT(mp.Process):
         return self._port.value if self._port.value != 0 else None
 
     def get_experts(self, uids: List[str], expiration_time: Optional[DHTExpiration] = None,
-                    wait=True) -> List[Optional[RemoteExpert]]:
+                    return_future=False) -> List[Optional[RemoteExpert]]:
         """
         :param uids: find experts with these ids from across the DHT
         :param expiration_time: if specified, return experts that expire no sooner than this (based on get_dht_time)
-        :param wait: if True (default), return when experts are returned. Otherwise return a Future.
+        :param return_future: if False (default), return when experts are returned. Otherwise return MPFuture.
         :returns: a list of [RemoteExpert if found else None]
         """
         assert not isinstance(uids, str), "Please send a list / tuple of expert uids."
         future, _future = MPFuture.make_pair()
         self.pipe.send(('_get_experts', [], dict(uids=uids, expiration_time=expiration_time, future=_future)))
-        return future.result() if wait else future
+        return future if return_future else future.result()
 
     async def _get_experts(
             self, node: DHTNode, uids: List[str], expiration_time: Optional[DHTExpiration], future: MPFuture):
@@ -144,8 +144,8 @@ class DHT(mp.Process):
             expiration_time = get_dht_time()
         num_workers = len(uids) if self.max_workers is None else min(len(uids), self.max_workers)
         response = await node.get_many(uids, expiration_time, num_workers=num_workers)
-        future.set_result([RemoteExpert(uid, maybe_endpoint) if maybe_expiration_time else None
-                           for uid, (maybe_endpoint, maybe_expiration_time) in response.items()])
+        future.set_result([RemoteExpert(**expert_data) if maybe_expiration_time else None
+                           for uid, (expert_data, maybe_expiration_time) in response.items()])
 
     def declare_experts(self, uids: List[str], endpoint: Endpoint, wait=True, timeout=None) -> Optional[List[bool]]:
         """
@@ -172,14 +172,16 @@ class DHT(mp.Process):
             uid_parts = uid.split(self.UID_DELIMITER)
             for i in range(len(uid_parts)):
                 uid_prefix_i = self.UID_DELIMITER.join(uid_parts[:i + 1])
-                data_to_store[uid_prefix_i] = endpoint
+                data_to_store[uid_prefix_i] = {'uid': uid, 'endpoint': endpoint}
 
         store_keys, store_values = zip(*data_to_store.items())
         store_ok = await node.store_many(store_keys, store_values, expiration_time, num_workers=num_workers)
         if future is not None:
             future.set_result([store_ok[key] for key in data_to_store.keys()])
 
-    def first_k_active(self, uid_prefixes: List[str], k: int, max_prefetch: int = 1, chunk_size: Optional[int] = None):
+    def first_k_active(
+            self, uid_prefixes: List[str], k: int, max_prefetch: int = 1, chunk_size: Optional[int] = None,
+            return_future=False) -> Union[TOrderedDict[str, RemoteExpert], Awaitable[TOrderedDict[str, RemoteExpert]]]:
         """
         Find k prefixes with active experts; may return less if there aren't enough; used for DMoE beam search
 
@@ -187,20 +189,22 @@ class DHT(mp.Process):
         :param k: return at most *this many* active prefixes
         :param max_prefetch: pre-dispatch up to *this many* tasks (each for chunk_size experts)
         :param chunk_size: dispatch this many requests in one task
-        :returns: a list of at most :k: prefixes that have at least one active expert each;
+        :param return_future: if False (default), return when experts are returned. Otherwise return MPFuture.
+        :returns: an ordered dict {uid_prefix -> RemoteExpert} mapping at most :k: prefixes to matching experts
+            The keys in the returned dict are ordered the same as in uid_prefixes.
         """
         assert not isinstance(uid_prefixes, str), "please provide a list/tuple of prefixes as the first argument"
         future, _future = MPFuture.make_pair()
         self.pipe.send(('_first_k_active', [],
                         dict(uid_prefixes=uid_prefixes, k=k, max_prefetch=max_prefetch,
                              chunk_size=chunk_size or k, future=_future)))
-        return future.result()
+        return future if return_future else future.result()
 
     async def _first_k_active(
             self, node: DHTNode, uid_prefixes: List[str], k: int, max_prefetch: int, chunk_size: int, future: MPFuture):
         num_workers_per_chunk = min(chunk_size, self.max_workers or chunk_size)
         total_chunks = (len(uid_prefixes) - 1) // chunk_size + 1
-        active_prefixes = []
+        found: List[Tuple[str, RemoteExpert]] = []
 
         pending_tasks = deque(
             asyncio.create_task(node.get_many(uid_prefixes[chunk_i * chunk_size: (chunk_i + 1) * chunk_size],
@@ -212,14 +216,13 @@ class DHT(mp.Process):
             # parse task results in chronological order, launch additional tasks on demand
             response = await pending_tasks.popleft()
             for uid_prefix in uid_prefixes[chunk_i * chunk_size: (chunk_i + 1) * chunk_size]:
-                if response[uid_prefix][1] is not None:  # found active peer
-                    active_prefixes.append(uid_prefix)
+                maybe_expert_data, maybe_expiration_time = response[uid_prefix]
+                if maybe_expiration_time is not None:  # found active peer
+                    found.append((uid_prefix, RemoteExpert(**maybe_expert_data)))
                     # if we found enough active experts, finish immediately
-                    if len(active_prefixes) >= k:
+                    if len(found) >= k:
                         break
-            if len(active_prefixes) >= k:
-                for task in pending_tasks:
-                    task.cancel()
+            if len(found) >= k:
                 break
 
             pre_dispatch_chunk_i = chunk_i + len(pending_tasks) + 1
@@ -228,5 +231,8 @@ class DHT(mp.Process):
                     uid_prefixes[pre_dispatch_chunk_i * chunk_size: (pre_dispatch_chunk_i + 1) * chunk_size],
                     num_workers=num_workers_per_chunk)))
 
+        for task in pending_tasks:
+            task.cancel()
+
         # return k active prefixes or as many as we could find
-        future.set_result(active_prefixes)
+        future.set_result(OrderedDict(found))
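
Usage sketch for the changed first_k_active contract (assumes a locally started DHT; the uids and endpoint below are made up): the method now returns an ordered dict whose keys follow the order of uid_prefixes, and return_future=True yields an MPFuture instead of blocking.

    import hivemind

    dht = hivemind.DHT(start=True)
    dht.declare_experts(['ffn.0.0', 'ffn.1.0'], endpoint='127.0.0.1:1337')

    active = dht.first_k_active(['ffn.0', 'ffn.1', 'ffn.2'], k=2)
    for uid_prefix, expert in active.items():   # OrderedDict[str, RemoteExpert], not a list of prefixes
        print(uid_prefix, expert.uid, expert.endpoint)

    future = dht.first_k_active(['ffn.0', 'ffn.1'], k=1, return_future=True)  # non-blocking variant
    print(future.result())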

+ 1 - 1
hivemind/dht/node.py

@@ -67,7 +67,7 @@ class DHTNode:
         :param depth_modulo: split full k-bucket if it contains root OR up to the nearest multiple of this value (≈b)
         :param parallel_rpc: maximum number of concurrent outgoing RPC requests emitted by DHTProtocol
           Reduce this value if your RPC requests register no response despite the peer sending the response.
-        :param wait_timeout: a kademlia rpc request is deemed lost if we did not recieve a reply in this many seconds
+        :param wait_timeout: a kademlia rpc request is deemed lost if we did not receive a reply in this many seconds
         :param refresh_timeout: refresh buckets if no node from that bucket was updated in this many seconds
           if staleness_timeout is None, DHTNode will not refresh stale buckets (which is usually okay)
         :param bootstrap_timeout: after one of peers responds, await other peers for at most this many seconds

+ 2 - 2
hivemind/server/__init__.py

@@ -41,8 +41,8 @@ class Server(threading.Thread):
         super().__init__()
         self.dht, self.experts, self.update_period = dht, expert_backends, update_period
         if get_port(listen_on) is None:
-            self.listen_on = listen_on = replace_port(listen_on, new_port=find_open_port())
-        self.port = get_port(listen_on)
+            listen_on = replace_port(listen_on, new_port=find_open_port())
+        self.listen_on, self.port = listen_on, get_port(listen_on)
 
         self.conn_handlers = [ConnectionHandler(listen_on, self.experts) for _ in range(num_connection_handlers)]
         if checkpoint_dir is not None:

+ 0 - 1
hivemind/utils/__init__.py

@@ -4,6 +4,5 @@ from hivemind.utils.tensor_descr import *
 from hivemind.utils.serializer import *
 from hivemind.utils.mpfuture import *
 from hivemind.utils.threading import *
-from hivemind.utils.autograd import *
 from hivemind.utils.grpc import *
 from hivemind.utils.logging import get_logger

+ 0 - 100
hivemind/utils/autograd.py

@@ -1,100 +0,0 @@
-"""
-Temporary autograd extensions to enable inter-op parallelism during backward pass
-Note: we should get rid of this module if https://github.com/pytorch/pytorch/pull/33157 reaches a pytorch release
-"""
-from itertools import chain
-from typing import Tuple, Any
-from concurrent.futures import Future
-
-import numpy as np
-import torch
-import torch.autograd.function
-
-from hivemind.utils.threading import run_in_background
-
-
-class EmulatedAutogradContext(torch.autograd.function._ContextMethodMixin):
-    """
-    A special class that pretends to be pytorch autograd context. Used to circumvent limitatons of pytorch autograd,
-    such as running several parallel backwards or transferring backward to a separate device.
-    This class is not tested outside its use cases in RemoteMixtureOfExperts and we do not recommend using it elsewhere.
-    """
-
-    @property
-    def saved_tensors(self):
-        return tuple(self.to_save)
-
-
-def run_isolated_forward(func: torch.autograd.Function, *args) -> Tuple[EmulatedAutogradContext, Any]:
-    """
-    run :func: in a detached pytorch graph, return *detached* function outputs and an EmulatedAutogradContext that
-    can be used to run backward through the same graph (performed manually by the user).
-    """
-    ctx = EmulatedAutogradContext()
-    # create detached copies of every input so that we can differentiate w.r.t. them without modifying actual variables
-    args = tuple(x.detach().requires_grad_(x.requires_grad) if isinstance(x, torch.Tensor) else x for x in args)
-    with torch.no_grad():
-        return ctx, func.forward(ctx, *args)
-
-
-def run_isolated_backward(func: torch.autograd.Function, ctx: EmulatedAutogradContext, *grad_outputs):
-    """
-    run backward pass for :func: in an isolated graph that was previously created through run_isolated_forward
-    """
-    with torch.no_grad():
-        return func.backward(ctx, *grad_outputs)
-
-
-def map_with_parallel_backward(
-        func: torch.autograd.Function, *args_per_call: Tuple[torch.Tensor, ...]) -> Tuple[Tuple[torch.Tensor, ...]]:
-    """
-    Apply an autograd function to several sets of inputs with two extra guarantees:
-    (1) both forward and backward pass happens concurrently for each set of inputs
-    (2) any operation dependent on any individual function will wait for all functions to finish
-    :param func: torch autograd function to be called several times in parallel
-    :param args_per_call: a sequence of tuples of arguments, each tuple corresponds to one function call
-    :returns: a tuple of outputs from each func call
-
-    Note: this function currently requires that all :func: calls succeed (i.e. do not raise an exception).
-    """
-    arg_counts = list(map(len, args_per_call))
-    assert len(set(arg_counts)) == 1, "All input sets must have the same number of arguments"
-    output_strides_ph = Future()
-    flat_outputs: Tuple[torch.Tensor, ...] = _ParallelApplyFunction.apply(
-        func, len(args_per_call), arg_counts[0], output_strides_ph, *chain(*args_per_call))
-    output_strides = output_strides_ph.result()
-    return tuple(flat_outputs[output_strides[i]: output_strides[i + 1]] for i in range(len(output_strides) - 1))
-
-
-class _ParallelApplyFunction(torch.autograd.Function):
-    """
-    A special torch autograd function that runs another function several times in parallel.
-    Please do not call this function directly. Use apply_with_parallel_backward instead.
-    Unlike default pytorch behavior, the backward pass for each function will also happen in parallel.
-    """
-
-    @staticmethod
-    def forward(ctx, func: torch.autograd.Function, num_calls: int, num_args_per_call: int,
-                output_strides_ph: Future, *args_flat) -> Tuple[torch.Tensor, ...]:
-        assert num_calls * num_args_per_call == len(args_flat)
-        args_per_call = [args_flat[i * num_args_per_call: (i + 1) * num_args_per_call] for i in range(num_calls)]
-
-        futures = [run_in_background(run_isolated_forward, func, *args) for args in args_per_call]
-
-        contexts, outputs = zip(*[future.result() for future in futures])
-        output_strides = np.cumsum([0] + list(map(len, outputs)))
-        ctx._inner_func = func
-        ctx._call_contexts = contexts
-        ctx._output_strides = output_strides
-        output_strides_ph.set_result(output_strides)
-        return tuple(chain(*outputs))
-
-    @staticmethod
-    def backward(ctx, *grad_outputs_flat: torch.Tensor):
-        func, contexts, output_strides = ctx._inner_func, ctx._call_contexts, ctx._output_strides
-        grad_outputs_per_call = [grad_outputs_flat[output_strides[i]: output_strides[i + 1]]
-                                 for i in range(len(contexts))]
-        futures = [run_in_background(run_isolated_backward, func, context, *grads)
-                   for context, grads in zip(contexts, grad_outputs_per_call)]
-        flat_grads_wrt_input = tuple(grad for future in futures for grad in future.result())
-        return (None, None, None, None, *flat_grads_wrt_input)

+ 0 - 1
hivemind/utils/networking.py

@@ -1,5 +1,4 @@
 import socket
-import urllib.parse
 from contextlib import closing
 from typing import Optional
 

+ 1 - 1
hivemind/utils/serializer.py

@@ -40,7 +40,7 @@ class PytorchSerializer(SerializerBase):
 class MSGPackSerializer(SerializerBase):
     @staticmethod
     def dumps(obj: object) -> bytes:
-        return umsgpack.dumps(obj, use_bin_type=False)  # TODO strict https://github.com/msgpack/msgpack-python/pull/158
+        return umsgpack.dumps(obj, use_bin_type=False, strict_types=True)
 
     @staticmethod
     def loads(buf: bytes) -> object:

+ 1 - 50
hivemind/utils/threading.py

@@ -1,7 +1,5 @@
 import os
-from concurrent.futures import Future, as_completed, TimeoutError, ThreadPoolExecutor
-import time
-from typing import Optional, List
+from concurrent.futures import Future, ThreadPoolExecutor
 
 EXECUTOR_PID, GLOBAL_EXECUTOR = None, None
 
@@ -13,50 +11,3 @@ def run_in_background(func: callable, *args, **kwargs) -> Future:
         GLOBAL_EXECUTOR = ThreadPoolExecutor(max_workers=os.environ.get("HIVEMIND_THREADS", float('inf')))
         EXECUTOR_PID = os.getpid()
     return GLOBAL_EXECUTOR.submit(func, *args, **kwargs)
-
-
-def run_and_await_k(jobs: List[callable], k: int,
-                    timeout_after_k: Optional[float] = 0, timeout_total: Optional[float] = None):
-    """
-    Runs all :jobs: asynchronously, awaits for at least k of them to finish
-    :param jobs: functions to call asynchronously
-    :param k: how many functions should finish for call to be successful
-    :param timeout_after_k: after reaching k finished jobs, wait for this long before cancelling
-    :param timeout_total: if specified, terminate cancel jobs after this many seconds
-    :returns: a list of either results or exceptions for each job
-    """
-    jobs = list(jobs)
-    assert k <= len(jobs), f"Can't await {k} out of {len(jobs)} jobs."
-    start_time = time.time()
-    future_to_ix = {run_in_background(job): i for i, job in enumerate(jobs)}
-    outputs = [None] * len(jobs)
-    success_count = 0
-
-    try:
-        # await first k futures for as long as it takes
-        for future in as_completed(list(future_to_ix.keys()), timeout=timeout_total):
-            success_count += int(not future.exception())
-            outputs[future_to_ix.pop(future)] = future.result() if not future.exception() else future.exception()
-            if success_count >= k:
-                break  # we have enough futures to succeed
-            if len(outputs) + len(future_to_ix) < k:
-                failed = len(jobs) - len(outputs) - len(future_to_ix)
-                raise ValueError(f"Couldn't get enough results: too many jobs failed ({failed} / {len(outputs)})")
-
-        # await stragglers for at most self.timeout_after_k_min or whatever time is left
-        if timeout_after_k is not None and timeout_total is not None:
-            time_left = min(timeout_after_k, timeout_total - time.time() + start_time)
-        else:
-            time_left = timeout_after_k if timeout_after_k is not None else timeout_total
-        for future in as_completed(list(future_to_ix.keys()), timeout=time_left):
-            success_count += int(not future.exception())
-            outputs[future_to_ix.pop(future)] = future.result() if not future.exception() else future.exception()
-
-    except TimeoutError:
-        if len(outputs) < k:
-            raise TimeoutError(f"Couldn't get enough results: time limit exceeded (got {len(outputs)} of {k})")
-    finally:
-        for future, index in future_to_ix.items():
-            future.cancel()
-            outputs[index] = future.result() if not future.exception() else future.exception()
-    return outputs
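
Context for dropping run_and_await_k: its role is now played by _RemoteCallMany._wait_for_responses, which uses asyncio.wait with FIRST_COMPLETED instead of a thread pool. A tiny standalone sketch of the underlying wait-for-k-then-grace-period pattern (task count, k and the grace period are arbitrary):

    import asyncio, random

    async def demo(num_tasks: int = 5, k: int = 2, grace: float = 0.05):
        async def job(i):
            await asyncio.sleep(random.random())
            return i

        pending = {asyncio.create_task(job(i)) for i in range(num_tasks)}
        results = []
        while pending and len(results) < k:            # gather results until k jobs have finished
            done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
            results.extend(task.result() for task in done)
        if pending:                                    # short grace period, then cancel stragglers
            done, pending = await asyncio.wait(pending, timeout=grace)
            results.extend(task.result() for task in done)
            for task in pending:
                task.cancel()
        return results

    print(asyncio.run(demo()))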

+ 0 - 1
tests/benchmark_dht.py

@@ -1,7 +1,6 @@
 import time
 import argparse
 import random
-from typing import Tuple
 from warnings import warn
 import hivemind
 from tqdm import trange

+ 32 - 7
tests/test_dht.py

@@ -290,14 +290,14 @@ def test_hivemind_dht():
     assert you_found.endpoint == f'that_host:{that_guys_port}'
 
     # test first_k_active
-    assert theguyshetoldyounottoworryabout.first_k_active(expert_uids, k=10) == expert_uids[:10]
+    assert list(theguyshetoldyounottoworryabout.first_k_active(expert_uids, k=10)) == expert_uids[:10]
 
     some_permuted_experts = random.sample(expert_uids, k=32)
-    assert theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=32) == some_permuted_experts
-    assert theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=1) == some_permuted_experts[:1]
+    assert list(theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=32)) == some_permuted_experts
+    assert list(theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=1)) == some_permuted_experts[:1]
     fake_and_real_experts = list(chain(*zip(
         [str(uuid.uuid4()) for _ in some_permuted_experts], some_permuted_experts)))
-    assert theguyshetoldyounottoworryabout.first_k_active(fake_and_real_experts, k=9) == some_permuted_experts[:9]
+    assert list(theguyshetoldyounottoworryabout.first_k_active(fake_and_real_experts, k=9)) == some_permuted_experts[:9]
 
     for peer in peers:
         peer.shutdown()
@@ -305,10 +305,35 @@ def test_hivemind_dht():
 
 def test_dht_single_node():
     node = hivemind.DHT(start=True)
-    assert all(node.declare_experts(['e1', 'e2', 'e3'], f"{hivemind.LOCALHOST}:{1337}"))
+    assert node.first_k_active(['e3', 'e2'], k=3) == {}
+    assert node.get_experts(['e3', 'e2']) == [None, None]
+
+    assert all(node.declare_experts(['e1', 'e2', 'e3'], f"{hivemind.LOCALHOST}:1337"))
     for expert in node.get_experts(['e3', 'e2']):
-        assert expert.endpoint == f"{hivemind.LOCALHOST}:{1337}"
-    assert node.first_k_active(['e0', 'e1', 'e3', 'e5', 'e2'], k=2) == ['e1', 'e3']
+        assert expert.endpoint == f"{hivemind.LOCALHOST}:1337"
+    active_found = node.first_k_active(['e0', 'e1', 'e3', 'e5', 'e2'], k=2)
+    assert list(active_found.keys()) == ['e1', 'e3']
+    assert all(expert.uid.startswith(prefix) for prefix, expert in active_found.items())
+
+    assert all(node.declare_experts(['e1', 'e2', 'e3'], f"{hivemind.LOCALHOST}:1337"))
+
+
+def test_first_k_active():
+    node = hivemind.DHT(start=True)
+    assert all(node.declare_experts(['e.1.2.3', 'e.1.2.4', 'e.3.4.5'], endpoint=f"{hivemind.LOCALHOST}:1337"))
+    assert all(node.declare_experts(['e.2.1.1'], endpoint=f"{hivemind.LOCALHOST}:1338"))
+
+    results = node.first_k_active(['e.0', 'e.1', 'e.2', 'e.3'], k=2)
+    assert len(results) == 2 and next(iter(results.keys())) == 'e.1'
+    assert results['e.1'].uid in ('e.1.2.3', 'e.1.2.4') and results['e.1'].endpoint == f"{hivemind.LOCALHOST}:1337"
+    assert results['e.2'].uid == 'e.2.1.1' and results['e.2'].endpoint == f"{hivemind.LOCALHOST}:1338"
+
+    results = node.first_k_active(['e', 'e.1', 'e.1.2', 'e.1.2.3'], k=10)
+    assert len(results) == 4
+    assert 'e' in results
+    for k in ('e.1', 'e.1.2', 'e.1.2.3'):
+        assert results[k].uid in ('e.1.2.3', 'e.1.2.4') and results[k].endpoint == f"{hivemind.LOCALHOST}:1337"
+
 
 
 def test_store():

+ 110 - 29
tests/test_moe.py

@@ -1,46 +1,127 @@
+import asyncio
+
+import grpc
+import numpy as np
+import pytest
 import torch
 import hivemind
+from hivemind.client.expert import DUMMY
 from test_utils.run_server import background_server
 
 
-def test_remote_module_call():
-    """ Check that remote_module_call returns correct outputs and gradients if called directly """
-    num_experts = 8
+def test_moe():
+    all_expert_uids = [f'ffn.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}'
+                       for _ in range(20)]
+    with background_server(expert_uids=all_expert_uids, device='cpu', expert_cls='ffn',
+                           num_handlers=1, hidden_dim=16) as (server_endpoint, dht_endpoint):
+
+        dht = hivemind.DHT(start=True, expiration=999, initial_peers=[dht_endpoint])
+        # declare expert uids. Server *should* declare them by itself, but it takes time.
+        assert all(dht.declare_experts(all_expert_uids, endpoint=server_endpoint))
+
+        dmoe = hivemind.RemoteMixtureOfExperts(
+            in_features=16, grid_size=(32, 32, 32), dht=dht, k_best=3, uid_prefix='ffn')
+
+        for i in range(10):
+            out = dmoe(torch.randn(10, 16))
+            out.sum().backward()
+
+
+def test_call_many():
     k_min = 1
     timeout_after_k_min = None
     backward_k_min = 1
-    timeout_total = None
+    forward_timeout = None
     backward_timeout = None
     rtol = 1e-3
     atol = 1e-6
 
-    xx = torch.randn(32, 1024, requires_grad=True)
-    logits = torch.randn(3, requires_grad=True)
-    random_proj = torch.randn_like(xx)
+    with background_server(num_experts=5, device='cpu', expert_cls='ffn', num_handlers=8, hidden_dim=64,
+                           no_optimizer=True, no_dht=True) as (server_endpoint, dht_endpoint):
 
-    with background_server(num_experts=num_experts, device='cpu', num_handlers=1,
+        inputs = torch.randn(4, 64, requires_grad=True)
+        inputs_clone = inputs.clone().detach().requires_grad_(True)
+        e0, e1, e2, e3, e4 = [hivemind.RemoteExpert(f'expert.{i}', server_endpoint) for i in range(5)]
+        e5 = hivemind.RemoteExpert(f'thisshouldnotexist', '127.0.0.1:80')
+
+        mask, expert_outputs = hivemind.client.moe._RemoteCallMany.apply(
+            DUMMY, [[e0, e1, e2], [e2, e4], [e1, e5, e3], []],
+            k_min, backward_k_min, timeout_after_k_min, forward_timeout, backward_timeout,
+            asyncio.new_event_loop(), inputs
+        )
+        assert mask.shape == (4, 3)
+        assert expert_outputs.shape == (4, 3, 64)
+
+        assert np.all(mask.data.numpy() == np.array([[True, True, True],
+                                                     [True, True, False],
+                                                     [True, False, True],
+                                                     [False, False, False]])), f"Incorrect mask, {mask}"
+
+        reference_outputs = torch.zeros_like(expert_outputs)
+        reference_outputs[0, 0] = e0(inputs_clone[0:1])
+        reference_outputs[0, 1] = e1(inputs_clone[0:1])
+        reference_outputs[0, 2] = e2(inputs_clone[0:1])
+        reference_outputs[1, 0] = e2(inputs_clone[1:2])
+        reference_outputs[1, 1] = e4(inputs_clone[1:2])
+        reference_outputs[2, 0] = e1(inputs_clone[2:3])
+        reference_outputs[2, 2] = e3(inputs_clone[2:3])
+
+        assert torch.allclose(expert_outputs, reference_outputs, rtol, atol)
+        proj = torch.randn(4, 64)
+        loss = (expert_outputs[(0, 1, 1, 2), (0, 2, 1, 0)] * proj).sum()
+        loss.backward()
+        our_grad = inputs.grad.data.cpu().clone()
+
+        reference_loss = (reference_outputs[(0, 1, 1, 2), (0, 2, 1, 0)] * proj).sum()
+        reference_loss.backward()
+        reference_grad = inputs_clone.grad.data.cpu().clone()
+        assert torch.allclose(our_grad, reference_grad, rtol, atol)
+
+
+def test_remote_module_call():
+    with background_server(num_experts=1, device='cpu', expert_cls='ffn', num_handlers=1, hidden_dim=1024,
                            no_optimizer=True, no_dht=True) as (server_endpoint, dht_endpoint):
-        experts = [hivemind.RemoteExpert(uid=f'expert.{i}', endpoint=server_endpoint) for i in range(num_experts)]
-        moe_output, = hivemind.client.moe._RemoteMoECall.apply(
-            logits, experts[:len(logits)], k_min, timeout_after_k_min, backward_k_min, timeout_total, backward_timeout,
-            [(None,), {}], xx)
-
-        grad_xx_moe, = torch.autograd.grad(torch.sum(random_proj * moe_output), xx, retain_graph=True)
-        grad_logits_moe, = torch.autograd.grad(torch.sum(random_proj * moe_output), logits, retain_graph=True)
-
-        # reference outputs: call all experts manually and average their outputs with softmax probabilities
-        probs = torch.softmax(logits, 0)
-        outs = [expert(xx) for expert in experts[:3]]
-        manual_output = sum(p * x for p, x in zip(probs, outs))
-        grad_xx_manual, = torch.autograd.grad(torch.sum(random_proj * manual_output), xx, retain_graph=True)
-        grad_xx_manual_rerun, = torch.autograd.grad(torch.sum(random_proj * manual_output), xx, retain_graph=True)
-        grad_logits_manual, = torch.autograd.grad(torch.sum(random_proj * manual_output), logits, retain_graph=True)
-
-    assert torch.allclose(grad_xx_manual, grad_xx_manual_rerun, rtol, atol), "Experts are non-deterministic. The test" \
-                                                                             " is only valid for deterministic experts"
-    assert torch.allclose(moe_output, manual_output, rtol, atol), "_RemoteMoECall returned incorrect output"
-    assert torch.allclose(grad_xx_moe, grad_xx_manual, rtol, atol), "incorrect gradient w.r.t. input"
-    assert torch.allclose(grad_logits_moe, grad_logits_manual, rtol, atol), "incorrect gradient w.r.t. logits"
+        real_expert = hivemind.RemoteExpert('expert.0', server_endpoint)
+        fake_expert = hivemind.RemoteExpert('oiasfjiasjf', server_endpoint)
+
+        out1 = real_expert(torch.randn(1, 1024))
+        assert out1.shape == (1, 1024)
+        dummy_x = torch.randn(3, 1024, requires_grad=True)
+        out3 = real_expert(dummy_x)
+        assert out3.shape == (3, 1024)
+        out3_again = real_expert(dummy_x[1:])
+        assert torch.allclose(out3_again, out3[1:])
+        out3_again.norm().backward()
+        assert dummy_x.grad is not None and dummy_x.grad.norm() > 0
+
+        with pytest.raises(grpc.RpcError):
+            real_expert(torch.randn(3, 11))
+        with pytest.raises(grpc.RpcError):
+            fake_expert(dummy_x)
+
+
+def test_moe_beam_search():
+    all_expert_uids = [f'ffn.{5 + i}.{10 + j}.{15 + k}' for i in range(10) for j in range(10) for k in range(10)]
+    dht = hivemind.DHT(start=True, expiration=999)
+    assert all(dht.declare_experts(all_expert_uids, endpoint='fake-endpoint'))
+
+    dmoe = hivemind.RemoteMixtureOfExperts(
+        in_features=32, grid_size=(32, 32, 32), dht=dht, k_best=4, uid_prefix='ffn')
+
+    for i in range(25):
+        input = torch.randn(32)
+        grid_scores = dmoe.proj(input).split_with_sizes(dmoe.grid_size, dim=-1)
+
+        chosen_experts = dmoe.loop.run_until_complete(dmoe.beam_search(grid_scores, k_best=dmoe.k_best))
+
+        chosen_scores = dmoe.compute_expert_scores([dim_scores[None] for dim_scores in grid_scores],
+                                                   [chosen_experts])[0]
+
+        all_scores = dmoe.compute_expert_scores([dim_scores[None] for dim_scores in grid_scores],
+                                                [[hivemind.RemoteExpert(uid, '') for uid in all_expert_uids]])[0]
+        true_best_scores = sorted(all_scores.cpu().detach().numpy(), reverse=True)[:len(chosen_experts)]
+        our_best_scores = list(chosen_scores.cpu().detach().numpy())
+        assert np.allclose(true_best_scores, our_best_scores)
 
 
 def test_determinism():

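Note on usage: the tests above drive _RemoteCallMany and beam_search directly, but the layer is meant to be used as a regular torch module. Below is a minimal sketch only, assuming experts under the 'ffn' prefix are actually being served at the declared endpoint; the uids, endpoint, dimensions and the high-level forward call are illustrative assumptions, not taken from this commit.

import torch
import hivemind

# assumed setup: a local DHT node plus experts hosted elsewhere under the 'ffn' prefix
# (uids, endpoint and dimensions are illustrative, not taken from this commit)
dht = hivemind.DHT(start=True)
expert_uids = [f'ffn.{i}.{j}.{k}' for i in range(4) for j in range(4) for k in range(4)]
dht.declare_experts(expert_uids, endpoint='127.0.0.1:1337')

dmoe = hivemind.RemoteMixtureOfExperts(in_features=64, grid_size=(4, 4, 4),
                                       dht=dht, k_best=2, uid_prefix='ffn')
x = torch.randn(2, 64, requires_grad=True)
output = dmoe(x)          # beam search over the grid, then _RemoteCallMany on the chosen experts
output.sum().backward()   # gradients flow back to x through the experts that responded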
+ 14 - 8
tests/test_utils/run_server.py

@@ -11,7 +11,7 @@ import hivemind
 from test_utils.layers import name_to_block, name_to_input
 
 
-def make_dummy_server(listen_on='0.0.0.0:*', num_experts=1, expert_cls='ffn', hidden_dim=1024,
+def make_dummy_server(listen_on='0.0.0.0:*', num_experts=None, expert_uids=None, expert_cls='ffn', hidden_dim=1024,
                       num_handlers=None, expert_prefix='expert', expert_offset=0, max_batch_size=16384, device=None,
                       no_optimizer=False, no_dht=False, initial_peers=(), dht_port=None, root_port=None, verbose=True,
                       start=False, **kwargs) -> hivemind.Server:
@@ -19,11 +19,12 @@ def make_dummy_server(listen_on='0.0.0.0:*', num_experts=1, expert_cls='ffn', hi
     Instantiate a server with several identical experts. See argparse comments below for details
     :param listen_on: network interface with address and (optional) port, e.g. "127.0.0.1:1337" or "[::]:80"
     :param num_experts: run this many identical experts
+    :param expert_prefix: all expert uids will be {expert_prefix}.{index}
+    :param expert_offset: expert uid will use indices in range(expert_offset, expert_offset + num_experts)
+    :param expert_uids: spawn experts with these exact uids, overrides num_experts, expert_prefix and expert_offset
     :param expert_cls: expert type from test_utils.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop';
     :param hidden_dim: main dimension for expert_cls
     :param num_handlers: server will use this many parallel processes to handle incoming requests
-    :param expert_prefix: all expert uids will be {expert_prefix}.{index}
-    :param expert_offset: expert uid will use indices in range(expert_offset, expert_offset + num_experts)
     :param max_batch_size: total num examples in the same batch will not exceed this value
     :param device: all experts will use this device in torch notation; default: cuda if available else cpu
     :param no_optimizer: if specified, all optimizers use learning rate=0
@@ -36,6 +37,8 @@ def make_dummy_server(listen_on='0.0.0.0:*', num_experts=1, expert_cls='ffn', hi
     :param verbose: whether to print server started / finished / terminated events
     :param start: if True, starts server right away and returns when server is ready for requests
     """
+    assert (expert_uids is None) != (num_experts is None and expert_prefix == 'expert' and expert_offset == 0), \
+        "Please provide either expert uids *or* (num_experts, expert_prefix and expert_offset), not both"
     if verbose and len(kwargs) != 0:
         print("Ignored kwargs:", kwargs)
     assert expert_cls in name_to_block
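A quick sketch of the two call styles the updated signature is meant to support; the argument values and the import path are illustrative assumptions.

from test_utils.run_server import make_dummy_server

# style 1: auto-generated uids expert.0 ... expert.4
server_a = make_dummy_server(num_experts=5, expert_cls='ffn', hidden_dim=64)

# style 2: explicit uids, overriding num_experts / expert_prefix / expert_offset
server_b = make_dummy_server(expert_uids=['ffn.5.10.15', 'ffn.6.11.16'],
                             expert_cls='ffn', hidden_dim=64)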
@@ -68,11 +71,15 @@ def make_dummy_server(listen_on='0.0.0.0:*', num_experts=1, expert_cls='ffn', hi
         args_schema = (hivemind.BatchTensorDescriptor.from_tensor(sample_input),)
 
     # initialize experts
+    if expert_uids is None:
+        num_experts = num_experts if num_experts is not None else 1
+        expert_uids = [f'{expert_prefix}{hivemind.DHT.UID_DELIMITER}{i + expert_offset}'
+                       for i in range(num_experts)]
+
     experts = {}
-    for i in range(num_experts):
+    for expert_uid in expert_uids:
         expert = name_to_block[expert_cls](hidden_dim)
         opt = torch.optim.SGD(expert.parameters(), 0.0 if no_optimizer else 0.05)
-        expert_uid = f'{expert_prefix}{hivemind.DHT.UID_DELIMITER}{i + expert_offset}'
         experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert, opt=opt,
                                                      args_schema=args_schema,
                                                      outputs_schema=hivemind.BatchTensorDescriptor(hidden_dim),
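For reference, the uid construction in the hunk above yields names like those used throughout the tests; a standalone sketch, assuming hivemind.DHT.UID_DELIMITER is '.', consistent with uids such as expert.0 elsewhere in this commit.

expert_prefix, expert_offset, num_experts = 'expert', 0, 3
uid_delimiter = '.'  # stands in for hivemind.DHT.UID_DELIMITER, assumed to be '.' here
expert_uids = [f'{expert_prefix}{uid_delimiter}{i + expert_offset}' for i in range(num_experts)]
assert expert_uids == ['expert.0', 'expert.1', 'expert.2']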
@@ -108,7 +115,7 @@ def background_server(*args, shutdown_timeout=5, verbose=True, **kwargs) -> Tupl
         finally:
             if verbose:
                 print("Server failed to shutdown gracefully, terminating it the hard way...")
-            runner.terminate()
+            runner.kill()
             if verbose:
                 print("Server terminated.")
 
@@ -132,9 +139,8 @@ def _server_runner(pipe, *args, verbose, **kwargs):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--interface', type=str, default='0.0.0.0', required=False,
+    parser.add_argument('--listen_on', type=str, default='0.0.0.0:*', required=False,
                         help="'localhost' for local connections only, '0.0.0.0' for ipv4 '::' for ipv6")
-    parser.add_argument('--port', type=int, default=None, required=False, help="server will listen to this port")
     parser.add_argument('--num_experts', type=int, default=1, required=False, help="run this many identical experts")
     parser.add_argument('--expert_cls', type=str, default='ffn', required=False,
                         help="expert type from test_utils.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop'.")