
Add Switch Transformers-like RemoteMixtureOfExperts (#228)

* Add RemoteSwitchMixtureOfExperts

* Add load balancing and test for training

* Make grid_size non-optional

* Support passing *args as the batch size to BatchTensorDescriptor.make_empty

* Reformat tests/custom_networks.py

* Reformat test_custom_expert.py

* Add DelayedNopExpert

* Add allow_zero_outputs to RemoteMixtureOfExperts

* Fix exception handling in DHT, log tracebacks in async_run_coroutine

* Generate BatchTensorDescriptor from dummy inputs on server start
This ensures that outputs_schema contains correct attributes such as dtype

* Handle exceptions on start of background_server

* Add test_no_experts
Max Ryabinin, 4 years ago
commit 62652e1717
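
A minimal usage sketch of the new module, mirroring tests/test_moe.py and tests/test_training.py below; the peer address, grid size, and the 0.01 loss weight are illustrative assumptions, not fixed by this commit:

    import torch
    import hivemind

    dht = hivemind.DHT(initial_peers=["127.0.0.1:1337"], start=True)  # hypothetical running peer
    dmoe = hivemind.RemoteSwitchMixtureOfExperts(
        in_features=64, grid_size=(4, 4, 4), dht=dht, uid_prefix='expert.',
        k_best=1, allow_zero_outputs=True)

    outputs, balancing_loss = dmoe(torch.randn(8, 64))  # forward returns (outputs, load balancing loss)
    loss = outputs.sum() + 0.01 * balancing_loss        # the auxiliary loss weight is a free choice
    loss.backward()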

+ 4 - 0
docs/modules/client.rst

@@ -18,6 +18,10 @@
    :members:
    :member-order: bysource
 
+.. autoclass:: RemoteSwitchMixtureOfExperts
+   :members:
+   :member-order: bysource
+
 .. autoclass:: DecentralizedAverager
    :members:
    :member-order: bysource

+ 1 - 0
hivemind/client/__init__.py

@@ -1,4 +1,5 @@
 from hivemind.client.expert import RemoteExpert
 from hivemind.client.moe import RemoteMixtureOfExperts
+from hivemind.client.switch_moe import RemoteSwitchMixtureOfExperts
 from hivemind.client.averaging import DecentralizedAverager
 from hivemind.client.averaging.training import TrainingAverager

+ 3 - 2
hivemind/client/beam_search.py

@@ -63,7 +63,7 @@ class MoEBeamSearcher:
          Though, this is a pathological case (e.g. only 90 experts in an oversized 100x100 grid) that should be avoided.
     """
 
-    def __init__(self, dht: DHT, uid_prefix: ExpertPrefix, grid_size: Optional[Tuple[int, ...]] = None,
+    def __init__(self, dht: DHT, uid_prefix: ExpertPrefix, grid_size: Tuple[int, ...],
                  num_workers: Optional[int] = None, negative_caching: bool = True, **kwargs):
         if not uid_prefix.endswith(UID_DELIMITER):
             uid_prefix += UID_DELIMITER
@@ -71,6 +71,7 @@ class MoEBeamSearcher:
         assert is_valid_prefix(uid_prefix), f"Prefix '{uid_prefix}' is invalid."
         self.dht = dht
         self.uid_prefix, self.grid_size = uid_prefix, grid_size
+        self.total_grid_size = sum(grid_size)
         self.negative_caching, self.num_workers, self.dht_kwargs = negative_caching, num_workers, kwargs
 
     def get_initial_beam(self, scores: Sequence[float], beam_size: int, return_future: bool = False
@@ -174,7 +175,7 @@ class MoEBeamSearcher:
         :param return_future: if set to True, returns MPFuture that can be awaited to get the actual result
         :returns: a list that contains *up to* k_best RemoteExpert instances
         """
-        assert (not self.grid_size or len(grid_scores) == len(self.grid_size)) and beam_size > 0
+        assert len(grid_scores) == len(self.grid_size) and beam_size > 0
         return self.dht.run_coroutine(partial(self._find_best_experts, prefix=self.uid_prefix, beam_size=beam_size,
                                               grid_scores=list(grid_scores), negative_caching=self.negative_caching,
                                               num_workers=self.num_workers), return_future)
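
With grid_size now a required argument, a brief construction sketch (the prefix and dimensions are illustrative; MoEBeamSearcher and DHT come from this repository):

    from hivemind import DHT
    from hivemind.client.beam_search import MoEBeamSearcher

    dht = DHT(start=True)
    searcher = MoEBeamSearcher(dht, uid_prefix='expert.', grid_size=(4, 4, 4))
    assert searcher.total_grid_size == 12  # sum(grid_size), used for the gating projection in moe.py below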

+ 56 - 38
hivemind/client/moe.py

@@ -14,7 +14,7 @@ from hivemind.client.beam_search import MoEBeamSearcher
 from hivemind.client.expert import RemoteExpert, DUMMY, _get_expert_stub
 from hivemind.proto import runtime_pb2, runtime_pb2_grpc as runtime_grpc
 from hivemind.server.expert_uid import UID_DELIMITER
-from hivemind.utils import nested_pack, nested_flatten
+from hivemind.utils import nested_pack, nested_flatten, nested_map
 from hivemind.utils.compression import serialize_torch_tensor, deserialize_torch_tensor
 from hivemind.utils.logging import get_logger
 
@@ -23,7 +23,7 @@ logger = get_logger(__name__)
 
 class RemoteMixtureOfExperts(nn.Module):
     """
-    A torch module that performs mixture of experts inference with a local gating function and multiple remote experts.
+    A torch module that performs Mixture-of-Experts inference with a local gating function and multiple remote experts.
     Natively supports pytorch autograd.
 
     :note: By default, not all experts are guaranteed to perform forward pass. Moreover, not all of those who ran
@@ -38,14 +38,15 @@ class RemoteMixtureOfExperts(nn.Module):
     :param k_best: average this many highest-scoring experts to compute activations
     :param k_min: make sure at least this many experts returned output (i.e. didn't fail)
     :param timeout_after_k_min: wait for this many seconds after k_min experts returned results.
-    :param detect_anomalies: whether to check input/output tensors for NaN and infinity values
      Any expert that didn't manage to return output after that delay is considered unavailable
+    :param detect_anomalies: whether to check input/output tensors for NaN and infinity values
+    :param allow_zero_outputs: whether to return zeros if no experts respond on forward pass
     """
 
     def __init__(self, *, in_features, grid_size: Tuple[int, ...], dht: hivemind.DHT, uid_prefix: str, k_best: int,
                  k_min: int = 1, forward_timeout: Optional[float] = None, timeout_after_k_min: Optional[float] = None,
                  backward_k_min: int = 1, backward_timeout: Optional[float] = None, detect_anomalies: bool = False,
-                 **dht_kwargs):
+                 allow_zero_outputs: bool = False, **dht_kwargs):
         super().__init__()
         self.dht = dht
         self.beam_search = MoEBeamSearcher(dht, uid_prefix, grid_size, **dht_kwargs)
@@ -53,8 +54,10 @@ class RemoteMixtureOfExperts(nn.Module):
         self.forward_timeout, self.backward_timeout = forward_timeout, backward_timeout
         self.timeout_after_k_min = timeout_after_k_min
         self.detect_anomalies = detect_anomalies
+        self.allow_zero_outputs = allow_zero_outputs
 
-        self.proj = nn.Linear(in_features, sum(grid_size))  # jointly predict logits for all grid dimensions
+        # jointly predict logits for all grid dimensions
+        self.proj = nn.Linear(in_features, self.beam_search.total_grid_size)
         self._expert_info = None  # expert['info'] from one of experts in the grid
 
     def forward(self, input: torch.Tensor, *args: torch.Tensor, **kwargs: torch.Tensor):
@@ -87,7 +90,8 @@ class RemoteMixtureOfExperts(nn.Module):
 
         expert_mask, *expert_outputs = _RemoteCallMany.apply(
             DUMMY, chosen_experts, self.k_min, self.backward_k_min, self.timeout_after_k_min, self.forward_timeout,
-            self.backward_timeout, self.detect_anomalies, self.info, *nested_flatten(((input, *args), kwargs)))
+            self.backward_timeout, self.detect_anomalies, self.allow_zero_outputs, self.info,
+            *nested_flatten(((input, *args), kwargs)))
         # ^-- multiple tensors of shape [batch_size, max_experts, ...output_shape]
 
         expert_logits = self.compute_expert_scores(grid_scores, chosen_experts)
@@ -97,6 +101,7 @@ class RemoteMixtureOfExperts(nn.Module):
         averaged_outputs_flat = [
             (expert_weights[..., None] * tensor.flatten(start_dim=2)).view(tensor.shape).sum(dim=1)
             for tensor in expert_outputs]  # ^-- multiply by softmax weights along first 2 axes
+
         return nested_pack(averaged_outputs_flat, self.info['outputs_schema'])
 
     def compute_expert_scores(
@@ -152,13 +157,14 @@ class _RemoteCallMany(torch.autograd.Function):
     one expert succeeds for each input. For user-friendly version of this function, use RemoteMixtureOfExperts module.
 
     Note: experts that failed during forward will be assigned zero outputs and marked as mask[i, j] = 0,
-          experts that failed during backward will be treated as constants (i.e. gradients of through them are zeros)
+          experts that failed during backward will be treated as constants (i.e. gradients through them are zeros)
     """
 
     @classmethod
     def forward(cls, ctx, dummy, experts_per_sample: List[List[RemoteExpert]], k_min: int, backward_k_min: int,
                 timeout_after_k_min: float, forward_timeout: Optional[float], backward_timeout: Optional[float],
-                detect_anomalies: bool, info: Dict[str, Any], *flat_inputs: torch.Tensor) -> Tuple[torch.Tensor]:
+                detect_anomalies: bool, allow_zero_outputs: bool, info: Dict[str, Any],
+                *flat_inputs: torch.Tensor) -> Tuple[torch.Tensor]:
         assert not torch.is_grad_enabled()
         num_samples, max_experts = len(experts_per_sample), max(map(len, experts_per_sample))
 
@@ -181,32 +187,42 @@ class _RemoteCallMany(torch.autograd.Function):
                 new_task = stub.forward.future(runtime_pb2.ExpertRequest(uid=expert.uid, tensors=input_tensors))
                 pending_tasks[new_task] = (i, j)
 
-        alive_grid_indices, alive_flat_outputs = cls._collect_responses(
+        responded_inds, alive_flat_outputs = cls._collect_responses(
             pending_tasks, num_samples, k_min, forward_timeout, timeout_after_k_min, detect_anomalies)
-        if len(alive_grid_indices) == 0:
-            raise TimeoutError("Forward pass: no alive experts responded within timeout.")
+        if len(responded_inds) < k_min:
+            raise TimeoutError(f"Forward pass: less than {k_min} responded within timeout.")
+
+        if not isinstance(info['outputs_schema'], tuple):
+            outputs_schema = (info['outputs_schema'],)
+        else:
+            outputs_schema = info['outputs_schema']
+        outputs = nested_map(
+            lambda descriptor: descriptor.make_empty(num_samples, max_experts, device=flat_inputs[0].device).zero_(),
+            outputs_schema)
 
         # assemble responses
-        alive_ii, alive_jj = map(torch.as_tensor, zip(*alive_grid_indices))
-        mask = torch.zeros([num_samples, max_experts], dtype=torch.bool, device=flat_inputs[0].device)
-        mask[alive_ii, alive_jj] = True
+        if len(responded_inds) > 0 or allow_zero_outputs:
+            batch_inds, expert_inds = map(lambda x: torch.as_tensor(x, device=flat_inputs[0].device, dtype=torch.long),
+                                          list(zip(*responded_inds)) or ([], []))
 
-        alive_flat_outputs_stacked = (torch.cat(outputs) for outputs in zip(*alive_flat_outputs))
-        # torch tensors, i-th tensor is of shape [num_responded, *expert_outputs[i].shape]
+            alive_flat_outputs_stacked = (torch.cat(outputs) for outputs in zip(*alive_flat_outputs))
+            # torch tensors, i-th tensor is of shape [num_responded, *expert_outputs[i].shape]
 
-        outputs = []
-        for response_stacked in alive_flat_outputs_stacked:
-            output = torch.zeros(
-                [num_samples, max_experts, *response_stacked.shape[1:]], device=response_stacked.device,
-                dtype=response_stacked.dtype, requires_grad=response_stacked.requires_grad)
-            output[alive_ii, alive_jj] = response_stacked
-            outputs.append(output.to(flat_inputs[0].device))
+            for output, response_stacked in zip(outputs, alive_flat_outputs_stacked):
+                output[batch_inds, expert_inds] = response_stacked.to(output.device)
+
+        else:
+            raise RuntimeError('Forward pass: 0 experts responded within timeout and allow_zero_outputs is False')
+
+        mask = torch.zeros([num_samples, max_experts], dtype=torch.bool, device=flat_inputs[0].device)
+        mask[batch_inds, expert_inds] = True
 
         # save individual outputs for backward pass
-        ctx.save_for_backward(alive_ii, alive_jj, *flat_inputs_cpu)
+        ctx.save_for_backward(batch_inds, expert_inds, *flat_inputs_cpu)
         ctx._saved_non_tensors = (info, backward_k_min, backward_timeout, timeout_after_k_min, experts_per_sample,
                                   detect_anomalies)
-        return (mask,) + tuple(outputs)
+
+        return (mask,) + outputs
 
     @classmethod
     @once_differentiable
@@ -235,35 +251,37 @@ class _RemoteCallMany(torch.autograd.Function):
         for i, j, inputs_ij, grad_outputs_ij in zip(alive_ii.cpu().numpy(), alive_jj.cpu().numpy(),
                                                     inputs_per_expert, grad_outputs_per_expert):
             expert = expert_per_sample[i.item()][j.item()]
-            stub: runtime_grpc.ConnectionHandlerStub = _get_expert_stub(expert.endpoint)
+            stub = _get_expert_stub(expert.endpoint)
             inputs_and_grad_outputs = tuple(nested_flatten((inputs_ij, grad_outputs_ij)))
             tensors_serialized = [serialize_torch_tensor(tensor, proto.compression)
                                   for tensor, proto in zip(inputs_and_grad_outputs, backward_schema)]
             new_task = stub.backward.future(runtime_pb2.ExpertRequest(uid=expert.uid, tensors=tensors_serialized))
             pending_tasks[new_task] = (i, j)
 
-        backward_survivor_indices, survivor_grad_inputs = cls._collect_responses(
+        survivor_inds, survivor_grad_inputs = cls._collect_responses(
             pending_tasks, num_samples, backward_k_min, backward_timeout, timeout_after_k_min, detect_anomalies)
-        if len(backward_survivor_indices) == 0:
-            raise TimeoutError("Backward pass: no alive experts responded within timeout.")
+        if len(survivor_inds) < backward_k_min:
+            raise TimeoutError(f"Backward pass: less than {backward_k_min} experts responded within timeout.")
 
         # assemble responses
-        backward_survivor_ii, backward_survivor_jj = map(torch.as_tensor, zip(*backward_survivor_indices) or ([], []))
+        batch_inds, expert_inds = map(lambda x: torch.as_tensor(x, dtype=torch.long),
+                                      list(zip(*survivor_inds)) or ([], []))
 
         survivor_grad_inputs_stacked = (torch.cat(grad_inputs) for grad_inputs in zip(*survivor_grad_inputs))
         # torch tensors, i-th tensor is of shape [num_backward_survivors, *flat_inputs_cpu[i].shape]
 
-        grad_inputs = []
-        for i, survivor_grad_stacked in enumerate(survivor_grad_inputs_stacked):
+        grad_inputs = nested_map(
+            lambda descr: descr.make_empty(num_samples, device=flat_grad_outputs[0].device).zero_(),
+            list(nested_flatten(info['forward_schema'])))
+
+        for grad_input, survivor_grad_stacked in zip(grad_inputs, survivor_grad_inputs_stacked):
             grad_input_per_expert = torch.zeros(  # gradient tensor with individual contributions from each expert
-                (num_samples, max_experts, *flat_inputs_cpu[i].shape[1:]),
+                (num_samples, max_experts, *grad_input.shape[1:]),
                 device=survivor_grad_stacked.device, dtype=survivor_grad_stacked.dtype)
-            grad_input_per_expert[backward_survivor_ii, backward_survivor_jj] = survivor_grad_stacked
-
-            # sum gradients from each expert
-            grad_inputs.append(grad_input_per_expert.to(flat_grad_outputs[0].device).sum(dim=1))
+            grad_input_per_expert[batch_inds, expert_inds] = survivor_grad_stacked
+            grad_input.copy_(grad_input_per_expert.to(flat_grad_outputs[0].device).sum(dim=1))
 
-        return (DUMMY, None, None, None, None, None, None, None, None, *grad_inputs)
+        return (DUMMY, None, None, None, None, None, None, None, None, None, *grad_inputs)
 
     @staticmethod
     def _collect_responses(task_to_indices: Dict[grpc.Future, Tuple[int, int]], num_samples: int, k_min: int,
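
The refactored forward pass pre-allocates zeroed output buffers from outputs_schema and scatters whatever responses arrived into them, leaving all-zero rows for failed experts. A self-contained torch sketch of that pattern (shapes and the fake responses are made up for illustration):

    import torch

    num_samples, max_experts, out_dim = 4, 3, 16
    responded_inds = [(0, 1), (2, 0)]                        # (sample index, expert slot) pairs that responded
    responses = torch.randn(len(responded_inds), out_dim)

    output = torch.zeros(num_samples, max_experts, out_dim)  # stands in for descriptor.make_empty(...).zero_()
    batch_inds, expert_inds = map(torch.as_tensor, zip(*responded_inds))
    output[batch_inds, expert_inds] = responses              # non-responding experts keep zero outputs

    mask = torch.zeros(num_samples, max_experts, dtype=torch.bool)
    mask[batch_inds, expert_inds] = True                     # mask[i, j] = False marks failed experts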

+ 175 - 0
hivemind/client/switch_moe.py

@@ -0,0 +1,175 @@
+from __future__ import annotations
+
+from typing import Tuple, List
+
+import grpc
+import torch
+
+from hivemind.client.expert import RemoteExpert, DUMMY
+from hivemind.client.moe import RemoteMixtureOfExperts, _RemoteCallMany
+from hivemind.server.expert_uid import UID_DELIMITER
+from hivemind.utils import nested_pack, nested_flatten
+from hivemind.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class RemoteSwitchMixtureOfExperts(RemoteMixtureOfExperts):
+    """
+    A module implementing Switch Transformers [1] Mixture-of-Experts inference with remote experts.
+
+    [1] Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity.
+     William Fedus, Barret Zoph, Noam Shazeer. https://arxiv.org/abs/2101.03961
+
+    :note: By default, not all experts are guaranteed to perform forward pass. Moreover, not all of those who ran
+     forward pass are guaranteed to perform backward pass. In the latter case, gradient will be averaged without
+     the missing experts
+
+    :param in_features: common input size for experts and gating function
+    :param grid_size: dimensions that form expert uid (see below)
+    :param uid_prefix: common prefix for all expert uids (must end with '.')
+    :note: expert uid follows the pattern {uid_prefix}.{0...grid_size[0]}.{0...grid_size[1]}...{0...grid_size[-1]}
+    :param dht: a DHT instance used to search for best experts
+    :param k_best: average this many highest-scoring experts to compute activations
+    :param k_min: make sure at least this many experts returned output (i.e. didn't fail)
+    :param timeout_after_k_min: wait for this many seconds after k_min experts returned results.
+     Any expert that didn't manage to return output after that delay is considered unavailable
+    :param detect_anomalies: whether to check input/output tensors for NaN and infinity values
+    :param allow_zero_outputs: whether to return just the input if no experts respond on forward pass
+    """
+
+    def __init__(self, *, grid_size: Tuple[int, ...], utilization_alpha: float = 0.9, grid_dropout: float = 1.0,
+                 jitter_eps: float = 1e-2, k_best=1, k_min=0, backward_k_min=0, allow_zero_outputs=True, **kwargs):
+        super().__init__(grid_size=grid_size, k_best=k_best, k_min=k_min, backward_k_min=backward_k_min,
+                         allow_zero_outputs=allow_zero_outputs, **kwargs)
+
+        initial_utilization = torch.cat(
+            [torch.tensor([1 / dim_size for _ in range(dim_size)], dtype=torch.float)
+             for dim_size in grid_size],
+        )
+        self.register_buffer('grid_utilization', initial_utilization)
+        self.utilization_alpha = utilization_alpha
+        self.grid_dropout = grid_dropout
+        self.jitter_eps = jitter_eps
+
+    def forward(self, input: torch.Tensor, *args: torch.Tensor, **kwargs: torch.Tensor):
+        if input.ndim != 2:
+            input_for_gating = input.mean(dim=tuple(range(1, input.ndim - 1)))
+        else:
+            input_for_gating = input
+
+        # Multiplicative jitter for regularized routing
+        jitter_noise = torch.empty_like(input_for_gating).uniform_(1 - self.jitter_eps, 1 + self.jitter_eps)
+        input_for_gating *= jitter_noise
+
+        # Compute scores, find most appropriate experts with beam search
+        grid_scores = self.proj(input_for_gating).split_with_sizes(self.beam_search.grid_size, dim=-1)
+
+        grid_dropout_masks = (
+            (torch.rand(size=(dim_size,), dtype=input_for_gating.dtype, device=input_for_gating.device)
+             < self.grid_dropout) for dim_size in self.beam_search.grid_size
+        )
+        grid_scores_dropout = [torch.where(dropout_mask, grid_score,
+                                           torch.full((1,), float('-inf'), device=grid_score.device,
+                                                      dtype=grid_score.dtype))
+                               for grid_score, dropout_mask in zip(grid_scores, grid_dropout_masks)]
+
+        grid_softmax = [torch.softmax(grid_score, dim=-1) for grid_score in grid_scores_dropout]
+        chosen_experts: List[List[RemoteExpert]] = self.beam_search.batch_find_best_experts(
+            [scores.detach().cpu() for scores in grid_scores_dropout], self.k_best)
+
+        if self._expert_info is None:
+            try:
+                self._expert_info = next((expert.info for experts_i in chosen_experts for expert in experts_i))
+            except grpc.RpcError as e:
+                logger.warning(f"Failed to get RemoteMixtureOfExperts.output_shape: {e}")
+
+        expert_mask, *expert_outputs = _RemoteCallMany.apply(
+            DUMMY, chosen_experts, self.k_min, self.backward_k_min, self.timeout_after_k_min, self.forward_timeout,
+            self.backward_timeout, self.detect_anomalies, self.allow_zero_outputs, self.info,
+            *nested_flatten(((input, *args), kwargs)))
+        # ^-- multiple tensors of shape [batch_size, max_experts, ...output_shape]
+
+        batch_utilization = self._compute_batch_utilization(chosen_experts, expert_mask)
+        self.grid_utilization = \
+            self.utilization_alpha * self.grid_utilization + (1 - self.utilization_alpha) * batch_utilization
+
+        # compute expert probabilities as product across grid dimensions
+        expert_probs = self.compute_expert_scores(grid_softmax, chosen_experts)
+        masked_logits = torch.full((1,), float('-inf'), device=expert_probs.device, dtype=expert_probs.dtype)
+        expert_probs = torch.where(expert_mask, expert_probs, masked_logits)
+
+        # multiply outputs by expert probabilities
+        averaged_outputs_flat = [
+            (expert_probs[..., None] * tensor.flatten(start_dim=2)).view(tensor.shape).sum(dim=1)
+            for tensor in expert_outputs]  # ^-- multiply by softmax weights along first 2 axes
+
+        packed_outputs = nested_pack(averaged_outputs_flat, self.info['outputs_schema'])
+
+        # Load balancing loss: multiply fractions of probability mass and fractions of routed examples
+        # for each grid dimension, sum across all indices for a dimension. Optimizing this leads to uniform allocation
+        balancing_loss = torch.stack([torch.mean(dim_softmax.mean(0) * dim_utilization) * (dim_size ** 2)
+                                      for dim_softmax, dim_utilization, dim_size in
+                                      zip(grid_softmax, self.grid_utilization, self.beam_search.grid_size)]).sum()
+
+        # residual connection
+        if isinstance(packed_outputs, torch.Tensor):
+            packed_outputs = packed_outputs + input
+        else:
+            packed_outputs[0] = packed_outputs[0] + input
+
+        return packed_outputs, balancing_loss
+
+    @torch.no_grad()
+    def _compute_batch_utilization(self, batch_experts, expert_mask):
+        batch_utilization = [torch.zeros((dim_size,), dtype=self.grid_utilization.dtype,
+                                         device=self.grid_utilization.device)
+                             for dim_size in self.beam_search.grid_size]
+
+        # out of chosen_experts, select those for which expert_mask is True
+        for (sample_idx, expert_idx) in expert_mask.nonzero().numpy():
+            expert = batch_experts[sample_idx][expert_idx]
+            expert_indices = expert.uid[len(self.beam_search.uid_prefix):]
+            expert_indices = list(map(int, expert_indices.split(UID_DELIMITER)))
+
+            for dim_index, dim_utilization in zip(expert_indices, batch_utilization):
+                dim_utilization[dim_index] += 1
+
+        return torch.cat([
+            torch.nn.functional.normalize(dim_utilization, p=1, dim=0)
+            for dim_utilization in batch_utilization
+        ])
+
+    def compute_expert_scores(
+            self, grid_probs: List[torch.Tensor], batch_experts: List[List[RemoteExpert]]) -> torch.Tensor:
+        """
+        Compute scores for each expert by multiplying grid probabilities, autograd-friendly
+        :param grid_probs: list of torch tensors, i-th tensor contains scores for i-th grid dimension
+        :param batch_experts: list(batch) of lists(k) of up to k experts selected for this batch
+        :returns: a tensor of scores, float32[batch_size, k]
+        :note: if some rows in batch have less than max number of experts, their scores will be padded with -inf
+        """
+        expert_counts = list(map(len, batch_experts))
+        batch_size = len(batch_experts)
+        max_num_experts = max(expert_counts)
+        total_num_experts = sum(expert_counts)
+        expert_index_in_batch = torch.arange(total_num_experts, device=grid_probs[0].device)
+        expert_strides = torch.cumsum(torch.as_tensor([0] + expert_counts, device=grid_probs[0].device), dim=-1)[:-1]
+        flat_batch_indices = (expert_index_in_batch >= expert_strides[:, None]).to(torch.int32).sum(0) - 1
+        flat_local_indices = expert_index_in_batch - expert_strides[flat_batch_indices]
+        flat_experts = [expert for row in batch_experts for expert in row]
+
+        grid_indices = torch.zeros([len(flat_experts), len(grid_probs)], dtype=torch.int64)
+        for i, expert in enumerate(flat_experts):
+            expert_indices = expert.uid[len(self.beam_search.uid_prefix):]
+            expert_indices = list(map(int, expert_indices.split(UID_DELIMITER)))
+            grid_indices[i] = torch.as_tensor(expert_indices, dtype=grid_indices.dtype)
+
+        scores_per_dim = [
+            dim_scores[flat_batch_indices, dim_indices] if len(flat_batch_indices) else torch.zeros(0)
+            for dim_scores, dim_indices in zip(grid_probs, grid_indices.T)]
+        flat_scores = torch.prod(torch.stack(scores_per_dim, dim=0), dim=0)
+
+        scores = torch.full((batch_size, max_num_experts), fill_value=-float('inf'), device=grid_probs[0].device)
+        scores[flat_batch_indices, flat_local_indices] = flat_scores  # backprop-able w.r.t. flat_scores
+        return scores
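
For a single grid dimension, the load-balancing term above multiplies each expert's mean router probability by the fraction of examples routed to it and scales the mean by the squared number of experts, i.e. the Switch Transformers auxiliary loss N * sum_i(P_i * f_i). A small torch sketch with made-up routing statistics (in the module itself, the utilization is an exponential moving average across batches):

    import torch

    num_experts, batch_size = 4, 8
    router_probs = torch.softmax(torch.randn(batch_size, num_experts), dim=-1)  # grid_softmax for one dimension
    routed_to = torch.randint(num_experts, (batch_size,))                       # expert index chosen per sample
    utilization = torch.bincount(routed_to, minlength=num_experts).float() / batch_size

    # mean(P_i * f_i) * N**2 == N * sum(P_i * f_i); minimized when routing is uniform
    balancing_loss = torch.mean(router_probs.mean(0) * utilization) * num_experts ** 2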

+ 3 - 2
hivemind/dht/__init__.py

@@ -91,7 +91,7 @@ class DHT(mp.Process):
         """
         self.start()
         if await_ready and not self.ready.wait(timeout=timeout):
-            raise TimeoutError("Server didn't notify .ready in {timeout} seconds")
+            raise TimeoutError(f"Server didn't notify .ready in {timeout} seconds")
 
     def shutdown(self) -> None:
         """ Shut down a running dht process """
@@ -186,6 +186,7 @@ class DHT(mp.Process):
             else:
                 future.set_result(await main_task)
         except BaseException as e:
+            logger.exception(f'Caught an exception when running a coroutine: {e}')
             if not future.done():
                 future.set_exception(e)
 
@@ -243,7 +244,7 @@ class DHT(mp.Process):
                                             f" Please ensure the node is connected or specify peers=... manually."))
 
     def declare_experts(self, uids, endpoint, wait: bool = True):
-        logger.warning("dht.declare_experts is scheduled for removal in 0.9.8, please use hivemind.declare_experts.",)
+        logger.warning("dht.declare_experts is scheduled for removal in 0.9.8, please use hivemind.declare_experts.")
         return hivemind.declare_experts(self, uids, endpoint, wait=wait)
 
     def get_experts(self, uids, expiration_time: Optional[DHTExpiration] = None,

+ 20 - 11
hivemind/server/__init__.py

@@ -21,7 +21,7 @@ from hivemind.server.layers import name_to_block, name_to_input
 from hivemind.server.layers import add_custom_models_from_file, schedule_name_to_scheduler
 from hivemind.server.runtime import Runtime
 from hivemind.server.task_pool import Task, TaskPool, TaskPoolBase
-from hivemind.utils import Endpoint, get_port, replace_port, find_open_port, get_logger
+from hivemind.utils import Endpoint, get_port, replace_port, find_open_port, get_logger, BatchTensorDescriptor
 from hivemind.proto.runtime_pb2 import CompressionType
 
 logger = get_logger(__name__)
@@ -153,11 +153,11 @@ class Server(threading.Thread):
         optim_cls = optim_cls if optim_cls is not None else partial(torch.optim.SGD, lr=0.0)
         device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
 
-        sample_input = name_to_input[expert_cls](4, hidden_dim)
+        sample_input = name_to_input[expert_cls](3, hidden_dim)
         if isinstance(sample_input, tuple):
-            args_schema = tuple(hivemind.BatchTensorDescriptor.from_tensor(arg, compression) for arg in sample_input)
+            args_schema = tuple(BatchTensorDescriptor.from_tensor(arg, compression) for arg in sample_input)
         else:
-            args_schema = (hivemind.BatchTensorDescriptor.from_tensor(sample_input, compression),)
+            args_schema = (BatchTensorDescriptor.from_tensor(sample_input, compression),)
 
         scheduler = schedule_name_to_scheduler[scheduler]
 
@@ -167,8 +167,6 @@ class Server(threading.Thread):
             expert = name_to_block[expert_cls](hidden_dim)
             experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert,
                                                          args_schema=args_schema,
-                                                         outputs_schema=hivemind.BatchTensorDescriptor(
-                                                             hidden_dim, compression=compression),
                                                          optimizer=optim_cls(expert.parameters()),
                                                          scheduler=scheduler,
                                                          num_warmup_steps=num_warmup_steps,
@@ -264,11 +262,15 @@ def background_server(*args, shutdown_timeout=5, **kwargs) -> Tuple[hivemind.End
     """ A context manager that creates server in a background thread, awaits .ready on entry and shutdowns on exit """
     pipe, runners_pipe = mp.Pipe(duplex=True)
     runner = mp.Process(target=_server_runner, args=(runners_pipe, *args), kwargs=kwargs)
-
     try:
         runner.start()
-        yield pipe.recv()  # once the server is ready, runner will send us a tuple(hostname, port, dht port)
-        pipe.send('SHUTDOWN')  # on exit from context, send shutdown signal
+        # once the server is ready, runner will send us either (False, exception) or (True, (server_port, dht_port))
+        start_ok, data = pipe.recv()
+        if start_ok:
+            yield data
+            pipe.send('SHUTDOWN')  # on exit from context, send shutdown signal
+        else:
+            raise RuntimeError(f"Server failed to start: {data}")
     finally:
         runner.join(timeout=shutdown_timeout)
         if runner.is_alive():
@@ -278,14 +280,21 @@ def background_server(*args, shutdown_timeout=5, **kwargs) -> Tuple[hivemind.End
 
 
 def _server_runner(pipe, *args, **kwargs):
-    server = Server.create(*args, start=True, **kwargs)
+    try:
+        server = Server.create(*args, start=True, **kwargs)
+    except Exception as e:
+        logger.exception(f"Encountered an exception when starting a server: {e}")
+        pipe.send((False, f'{type(e).__name__} {e}'))
+        return
+
     try:
         if server.dht is not None:
             dht_listen_on = hivemind.replace_port(server.dht.listen_on, server.dht.port)
         else:
             dht_listen_on = None
-        pipe.send((server.listen_on, dht_listen_on))
+        pipe.send((True, (server.listen_on, dht_listen_on)))
         pipe.recv()  # wait for shutdown signal
+
     finally:
         logger.info("Shutting down server...")
         server.shutdown()
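
A hedged sketch of how the new startup handshake surfaces failures to the caller; the server arguments are illustrative, only the (start_ok, data) protocol and the RuntimeError come from this diff:

    from hivemind import background_server

    try:
        with background_server(num_experts=1, expert_cls='ffn', hidden_dim=16,
                               device='cpu', no_dht=True) as (server_endpoint, dht_endpoint):
            print("server is listening on", server_endpoint)  # dht_endpoint is None with no_dht=True
    except RuntimeError as err:  # raised when the runner reports (False, exception description)
        print("server failed to start:", err)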

+ 2 - 5
hivemind/server/layers/__init__.py

@@ -1,12 +1,9 @@
-import torch
-
 name_to_block = {}
 name_to_input = {}
 
-from hivemind.server.layers.lr_schedule import get_linear_schedule_with_warmup
-from hivemind.server.layers.custom_experts import add_custom_models_from_file, register_expert_class
-
 import hivemind.server.layers.common
 import hivemind.server.layers.dropout
+from hivemind.server.layers.custom_experts import add_custom_models_from_file, register_expert_class
+from hivemind.server.layers.lr_schedule import get_linear_schedule_with_warmup
 
 schedule_name_to_scheduler = {'linear': get_linear_schedule_with_warmup, 'none': None}

+ 22 - 1
hivemind/server/layers/common.py

@@ -1,3 +1,5 @@
+import time
+
 import torch
 from torch import nn as nn
 
@@ -11,6 +13,8 @@ def gelu_fast(x):
 
 
 ffn_sample_input = lambda batch_size, hid_dim: torch.empty((batch_size, hid_dim))
+
+
 @register_expert_class('ffn', ffn_sample_input)
 class FeedforwardBlock(nn.Module):
 
@@ -65,7 +69,9 @@ class TransformerEncoderLayer(nn.Module):
 
 transformer_sample_input = lambda batch_size, hid_dim: \
     (torch.empty((batch_size, 128, hid_dim)), \
-    torch.empty((batch_size, 128), dtype=torch.bool))
+     torch.empty((batch_size, 128), dtype=torch.bool))
+
+
 @register_expert_class('transformer', transformer_sample_input)
 class TunedTransformer(TransformerEncoderLayer):
 
@@ -74,6 +80,8 @@ class TunedTransformer(TransformerEncoderLayer):
 
 
 nop_sample_input = lambda batch_size, hid_dim: torch.empty((batch_size, hid_dim))
+
+
 @register_expert_class('nop', nop_sample_input)
 class NopExpert(nn.Sequential):
 
@@ -83,3 +91,16 @@ class NopExpert(nn.Sequential):
 
     def forward(self, x):
         return x.clone()
+
+
+@register_expert_class('nop_delay', nop_sample_input)
+class DelayedNopExpert(nn.Sequential):
+
+    def __init__(self, hid_dim, delay=0.5):
+        super().__init__()
+        self.w = nn.Parameter(torch.zeros(0), requires_grad=True)
+        self.delay = delay
+
+    def forward(self, x):
+        time.sleep(self.delay)
+        return x.clone()

+ 3 - 3
hivemind/utils/tensor_descr.py

@@ -46,7 +46,7 @@ class TensorDescriptor(DescriptorBase):
 
 @dataclass(repr=True, frozen=True)
 class BatchTensorDescriptor(TensorDescriptor):
-    """ torch Tensor with a variable 0-th dimension, used to describe batched data """
+    """ torch.Tensor with a variable 0-th dimension, used to describe batched data """
 
     def __init__(self, *instance_size, **kwargs):  # compatibility: allow initializing with *size
         if len(instance_size) == 1 and isinstance(instance_size[0], (list, tuple, torch.Size)):
@@ -60,9 +60,9 @@ class BatchTensorDescriptor(TensorDescriptor):
                    pin_memory=safe_check_pinned(tensor),
                    compression=compression if tensor.is_floating_point() else CompressionType.NONE)
 
-    def make_empty(self, batch_size, **kwargs):
+    def make_empty(self, *batch_size, **kwargs):
         assert self.shape[0] is None, "Make sure 0-th dimension is not specified (set to None)"
-        return super().make_empty(size=(batch_size, *self.shape[1:]), **kwargs)
+        return super().make_empty(size=(*batch_size, *self.shape[1:]), **kwargs)
 
 
 def safe_check_pinned(tensor: torch.Tensor) -> bool:
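
make_empty now accepts any number of leading dimensions instead of a single batch_size, which lets moe.py allocate [num_samples, max_experts, ...] buffers directly from the schema. A short sketch of the new call, assuming make_empty forwards size and device to torch.empty as it is used elsewhere in this diff:

    import torch
    from hivemind.utils.tensor_descr import BatchTensorDescriptor

    descr = BatchTensorDescriptor(16)                     # shape (None, 16): variable 0-th dimension
    buffer = descr.make_empty(4, 3, device='cpu').zero_()
    assert buffer.shape == torch.Size([4, 3, 16])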

+ 11 - 6
tests/custom_networks.py

@@ -4,11 +4,13 @@ import torch.nn.functional as F
 
 from hivemind.server.layers.custom_experts import register_expert_class
 
-sample_input = lambda batch_size, hidden_dim : torch.empty((batch_size, hidden_dim))
+sample_input = lambda batch_size, hidden_dim: torch.empty((batch_size, hidden_dim))
+
+
 @register_expert_class('perceptron', sample_input)
 class MultilayerPerceptron(nn.Module):
     def __init__(self, hidden_dim, num_classes=10):
-        super(MultilayerPerceptron, self).__init__()
+        super().__init__()
         self.layer1 = nn.Linear(hidden_dim, 2 * hidden_dim)
         self.layer2 = nn.Linear(2 * hidden_dim, 2 * hidden_dim)
         self.layer3 = nn.Linear(2 * hidden_dim, num_classes)
@@ -19,14 +21,17 @@ class MultilayerPerceptron(nn.Module):
         x = self.layer3(x)
         return x
 
-multihead_sample_input = lambda batch_size, hidden_dim : \
+
+multihead_sample_input = lambda batch_size, hidden_dim: \
     (torch.empty((batch_size, hidden_dim)),
-    torch.empty((batch_size, 2 * hidden_dim)),
-    torch.empty((batch_size, 3 * hidden_dim)),)
+     torch.empty((batch_size, 2 * hidden_dim)),
+     torch.empty((batch_size, 3 * hidden_dim)),)
+
+
 @register_expert_class('multihead', multihead_sample_input)
 class MultiheadNetwork(nn.Module):
     def __init__(self, hidden_dim, num_classes=10):
-        super(MultiheadNetwork, self).__init__()
+        super().__init__()
         self.layer1 = nn.Linear(hidden_dim, num_classes)
         self.layer2 = nn.Linear(2 * hidden_dim, num_classes)
         self.layer3 = nn.Linear(3 * hidden_dim, num_classes)

+ 13 - 14
tests/test_custom_expert.py

@@ -1,19 +1,17 @@
 import os
-import pytest
-from typing import Optional
 
+import pytest
 import torch
 
-import hivemind
 from hivemind import RemoteExpert, background_server
 
+
 @pytest.mark.forked
-def test_custom_expert(port: Optional[int] = None, hid_dim=16):
+def test_custom_expert(hid_dim=16):
     with background_server(
-        expert_cls='perceptron', num_experts=2, device='cpu',
-        hidden_dim=hid_dim, num_handlers=2, no_dht=True,
-        custom_module_path=os.path.join(os.path.dirname(__file__), 'custom_networks.py')) as (server_endpoint, _):
-
+            expert_cls='perceptron', num_experts=2, device='cpu',
+            hidden_dim=hid_dim, num_handlers=2, no_dht=True,
+            custom_module_path=os.path.join(os.path.dirname(__file__), 'custom_networks.py')) as (server_endpoint, _):
         expert0 = RemoteExpert('expert.0', server_endpoint)
         expert1 = RemoteExpert('expert.1', server_endpoint)
 
@@ -28,18 +26,19 @@ def test_custom_expert(port: Optional[int] = None, hid_dim=16):
             loss = output1.sum()
             loss.backward()
 
+
 @pytest.mark.forked
-def test_multihead_expert(port: Optional[int] = None, hid_dim=16):
+def test_multihead_expert(hid_dim=16):
     with background_server(
-        expert_cls='multihead', num_experts=2, device='cpu',
-        hidden_dim=hid_dim, num_handlers=2, no_dht=True,
-        custom_module_path=os.path.join(os.path.dirname(__file__), 'custom_networks.py')) as (server_endpoint, _):
-
+            expert_cls='multihead', num_experts=2, device='cpu',
+            hidden_dim=hid_dim, num_handlers=2, no_dht=True,
+            custom_module_path=os.path.join(os.path.dirname(__file__), 'custom_networks.py')) as (server_endpoint, _):
         expert0 = RemoteExpert('expert.0', server_endpoint)
         expert1 = RemoteExpert('expert.1', server_endpoint)
 
         for batch_size in (1, 4):
-            batch = (torch.randn(batch_size, hid_dim), torch.randn(batch_size, 2 * hid_dim), torch.randn(batch_size, 3 * hid_dim))
+            batch = (torch.randn(batch_size, hid_dim), torch.randn(batch_size, 2 * hid_dim),
+                     torch.randn(batch_size, 3 * hid_dim))
 
             output0 = expert0(*batch)
             output1 = expert1(*batch)

+ 3 - 3
tests/test_dht_experts.py

@@ -77,7 +77,7 @@ def test_beam_search(dht_size=20, total_experts=128, batch_size=32, initial_peer
 @pytest.mark.forked
 def test_dht_single_node():
     node = hivemind.DHT(start=True, expiration=999)
-    beam_search = MoEBeamSearcher(node, 'expert.')
+    beam_search = MoEBeamSearcher(node, 'expert.', grid_size=(10,))
 
     assert all(node.declare_experts(['expert.1', 'expert.2', 'expert.3'], f"{hivemind.LOCALHOST}:1337").values())
     assert len(node.declare_experts(["ffn.1", "ffn.2"], endpoint="that_place")) == 4
@@ -104,7 +104,7 @@ def test_dht_single_node():
     assert initial_beam[2][:2] == (0.0, 'expert.3.')
 
     with pytest.raises(AssertionError):
-        beam_search = MoEBeamSearcher(node, 'expert.1.ffn')
+        beam_search = MoEBeamSearcher(node, 'expert.1.ffn', (2, 2))
 
     with pytest.raises(AssertionError):
         beam_search.get_active_successors(['e.1.2.', 'e.2', 'e.4.5.'])
@@ -147,7 +147,7 @@ async def test_negative_caching():
 
     neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
     neg_caching_peer = hivemind.DHT(initial_peers=neighbors_i, cache_locally=False, start=True)
-    beam_search = MoEBeamSearcher(neg_caching_peer, uid_prefix='ffn.', negative_caching=True)
+    beam_search = MoEBeamSearcher(neg_caching_peer, uid_prefix='ffn.', grid_size=(10, 10, 10), negative_caching=True)
     # get prefixes by the peer with negative caching. Cache "no data" entries for ffn.0.*, ffn.2.*, ffn.4.*, ffn.5.*
     assert len(beam_search.get_initial_beam(scores=[.1, .2, .3, .4, .5, .6], beam_size=3)) == 2
 

+ 21 - 3
tests/test_moe.py

@@ -18,13 +18,30 @@ def test_moe():
         dht = hivemind.DHT(start=True, expiration=999, initial_peers=[dht_endpoint])
 
         dmoe = hivemind.RemoteMixtureOfExperts(
-            in_features=16, grid_size=(32, 32, 32), dht=dht, k_best=3, uid_prefix='ffn.')
+            in_features=16, grid_size=(4, 4, 4), dht=dht, k_best=3, uid_prefix='ffn.')
 
-        for i in range(5):
+        for i in range(3):
             out = dmoe(torch.randn(10, 16))
             out.sum().backward()
 
 
+@pytest.mark.forked
+def test_no_experts():
+    all_expert_uids = [f'expert.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}'
+                       for _ in range(10)]
+    with background_server(expert_uids=all_expert_uids, device='cpu', expert_cls='nop_delay', num_handlers=1,
+                           hidden_dim=16) as (server_endpoint, dht_endpoint):
+        dht = hivemind.DHT(start=True, expiration=999, initial_peers=[dht_endpoint])
+
+        dmoe = hivemind.RemoteSwitchMixtureOfExperts(
+            in_features=16, grid_size=(4, 4, 4), dht=dht, uid_prefix='expert.', forward_timeout=0.1,
+            backward_timeout=0.1, allow_zero_outputs=True)
+
+        for i in range(3):
+            out, balancing_loss = dmoe(torch.randn(10, 16))
+            out.sum().backward()
+
+
 @pytest.mark.forked
 def test_call_many(hidden_dim=16):
     k_min = 1
@@ -33,6 +50,7 @@ def test_call_many(hidden_dim=16):
     forward_timeout = None
     backward_timeout = None
     detect_anomalies = False
+    allow_zero_outputs = False
     atol = 1e-5
 
     with background_server(num_experts=5, device='cpu', expert_cls='ffn', num_handlers=1, hidden_dim=hidden_dim,
@@ -44,7 +62,7 @@ def test_call_many(hidden_dim=16):
 
         mask, expert_outputs = hivemind.client.moe._RemoteCallMany.apply(
             DUMMY, [[e0, e1, e2], [e2, e4], [e1, e5, e3], []], k_min, backward_k_min, timeout_after_k_min,
-            forward_timeout, backward_timeout, detect_anomalies, e1.info, inputs
+            forward_timeout, backward_timeout, detect_anomalies, allow_zero_outputs, e1.info, inputs
         )
         assert mask.shape == (4, 3)
         assert expert_outputs.shape == (4, 3, hidden_dim)

+ 75 - 3
tests/test_training.py

@@ -1,13 +1,14 @@
+import time
 from functools import partial
 
-import time
 import pytest
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from sklearn.datasets import load_digits
 
-from hivemind import RemoteExpert, background_server, DHT, DecentralizedSGD
+from hivemind import RemoteExpert, RemoteMixtureOfExperts, RemoteSwitchMixtureOfExperts, background_server, DHT, \
+    DecentralizedSGD
 
 
 @pytest.mark.forked
@@ -22,20 +23,91 @@ def test_training(max_steps: int = 100, threshold: float = 0.9):
         expert2 = RemoteExpert('expert.1', server_endpoint)
         model = nn.Sequential(expert2, nn.ReLU(), expert1, nn.Linear(64, 2))
 
-        opt = torch.optim.SGD(model.parameters(), lr=0.05)
+        opt = SGD(model.parameters(), lr=0.05)
 
         for step in range(max_steps):
+            outputs = model(X_train)
+            loss = F.cross_entropy(outputs, y_train)
+            loss.backward()
+            opt.step()
             opt.zero_grad()
 
+            accuracy = (outputs.argmax(dim=1) == y_train).float().mean().item()
+            if accuracy >= threshold:
+                break
+
+        assert accuracy >= threshold, f"too small accuracy: {accuracy}"
+
+
+@pytest.mark.forked
+def test_moe_training(max_steps: int = 100, threshold: float = 0.9, num_experts=2):
+    dataset = load_digits(n_class=2)
+    X_train, y_train = torch.tensor(dataset['data'], dtype=torch.float), torch.tensor(dataset['target'])
+    SGD = partial(torch.optim.SGD, lr=0.05)
+
+    all_expert_uids = [f'expert.{i}' for i in range(num_experts)]
+    with background_server(expert_uids=all_expert_uids, device='cpu', optim_cls=SGD, hidden_dim=64, num_handlers=1) \
+            as (server_endpoint, dht_endpoint):
+        dht = DHT(start=True, expiration=999, initial_peers=[dht_endpoint])
+
+        moe = RemoteMixtureOfExperts(in_features=64, grid_size=(num_experts,), dht=dht, uid_prefix='expert.', k_best=2)
+        model = nn.Sequential(moe, nn.Linear(64, 2))
+
+        opt = SGD(model.parameters(), lr=0.05)
+
+        for step in range(max_steps):
             outputs = model(X_train)
             loss = F.cross_entropy(outputs, y_train)
             loss.backward()
             opt.step()
+            opt.zero_grad()
+
+            accuracy = (outputs.argmax(dim=1) == y_train).float().mean().item()
+            if accuracy >= threshold:
+                break
+
+        assert accuracy >= threshold, f"too small accuracy: {accuracy}"
+
+
+class SwitchNetwork(nn.Module):
+    def __init__(self, dht, in_features, num_classes, num_experts):
+        super().__init__()
+        self.moe = RemoteSwitchMixtureOfExperts(in_features=in_features, grid_size=(num_experts,), dht=dht,
+                                                jitter_eps=0, uid_prefix='expert.', k_best=1,
+                                                k_min=1)
+        self.linear = nn.Linear(in_features, num_classes)
+
+    def forward(self, x):
+        moe_output, balancing_loss = self.moe(x)
+        return self.linear(moe_output), balancing_loss
+
+
+@pytest.mark.forked
+def test_switch_training(max_steps: int = 10, threshold: float = 0.9, num_experts=5):
+    dataset = load_digits(n_class=2)
+    X_train, y_train = torch.tensor(dataset['data'], dtype=torch.float), torch.tensor(dataset['target'])
+    SGD = partial(torch.optim.SGD, lr=0.05)
+
+    all_expert_uids = [f'expert.{i}' for i in range(num_experts)]
+    with background_server(expert_uids=all_expert_uids, device='cpu', optim_cls=SGD, hidden_dim=64,
+                           num_handlers=1) as (server_endpoint, dht_endpoint):
+        dht = DHT(start=True, expiration=999, initial_peers=[dht_endpoint])
+
+        model = SwitchNetwork(dht, 64, 2, num_experts)
+        opt = SGD(model.parameters(), lr=0.05)
+
+        for step in range(max_steps):
+            outputs, balancing_loss = model(X_train)
+            loss = F.cross_entropy(outputs, y_train) + 0.01 * balancing_loss
+            loss.backward()
+            opt.step()
+            opt.zero_grad()
 
             accuracy = (outputs.argmax(dim=1) == y_train).float().mean().item()
             if accuracy >= threshold:
                 break
 
+        assert model.moe.grid_utilization.min().item() > (1 / num_experts) / 2
         assert accuracy >= threshold, f"too small accuracy: {accuracy}"