
Convert hivemind.server to libp2p backend (#470)

Switch hivemind MoE from gRPC to libp2p.
This allows serving experts from behind NATs and firewalls and improves performance under high network latency.

Changes:
 - RemoteExpert (and the MoE client modules) now communicate with servers via libp2p (see the usage sketch below)
 - Removed the listen_on parameter from hivemind.Server and the CLI tools
 - ConnectionHandlers now use load balancing for better performance (see benchmarks in the corresponding PR)
 - Updated docs & tests
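
For illustration, a minimal end-to-end sketch of the new libp2p client path, assembled from the updated `benchmarks/benchmark_throughput.py` in this diff; the single-expert setup and the `outputs_schema` argument are assumptions made for brevity, not part of the change itself:

```python
import torch

from hivemind.dht import DHT
from hivemind.moe.client.expert import RemoteExpert, RemoteExpertInfo
from hivemind.moe.client.remote_expert_worker import RemoteExpertWorker
from hivemind.moe.server import ExpertBackend, Server
from hivemind.moe.server.layers import name_to_block
from hivemind.p2p import P2P, PeerInfo
from hivemind.utils.tensor_descr import BatchTensorDescriptor

hid_dim = 512
dht = DHT(start=True)  # the server's DHT node; clients bootstrap from its visible multiaddrs

# server side: host a single ffn expert (mirrors the updated benchmark below)
expert_module = name_to_block["ffn"](hid_dim)
backend = ExpertBackend(
    name="expert.0",
    expert=expert_module,
    optimizer=torch.optim.Adam(expert_module.parameters()),
    args_schema=(BatchTensorDescriptor(hid_dim),),
    outputs_schema=BatchTensorDescriptor(hid_dim),  # assumption: explicit output schema, as in earlier releases
    max_batch_size=16,
)
server = Server(dht=dht, expert_backends={"expert.0": backend}, num_connection_handlers=1, device="cpu")
server.start()
server.ready.wait()

# client side: no host/port pair anymore; reach the expert over libp2p via the server's peer info
p2p = RemoteExpertWorker.run_coroutine(P2P.create(initial_peers=dht.get_visible_maddrs()))
peer_info = PeerInfo(dht.peer_id, dht.get_visible_maddrs())
expert = RemoteExpert(expert_info=RemoteExpertInfo(uid="expert.0", peer_info=peer_info), p2p=p2p)

out = expert(torch.randn(4, hid_dim))  # forward and backward now run over libp2p instead of gRPC
out.sum().backward()
```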

Co-authored-by: Denis Mazur <denismazur8@gmail.com>
Co-authored-by: Alexander Borzunov <hxrussia@gmail.com>
Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>
GreenFatGuy committed 3 years ago
commit 724cdfe5e2
40 files changed: 915 additions, 603 deletions
  1. .github/workflows/run-tests.yml (+2, -2)
  2. benchmarks/benchmark_throughput.py (+35, -17)
  3. docs/user/moe.md (+9, -12)
  4. hivemind/averaging/averager.py (+2, -1)
  5. hivemind/compression/__init__.py (+5, -1)
  6. hivemind/compression/adaptive.py (+2, -2)
  7. hivemind/compression/base.py (+1, -1)
  8. hivemind/compression/serialization.py (+25, -1)
  9. hivemind/dht/dht.py (+3, -2)
  10. hivemind/hivemind_cli/config.yml (+0, -1)
  11. hivemind/hivemind_cli/run_server.py (+6, -3)
  12. hivemind/moe/client/beam_search.py (+23, -10)
  13. hivemind/moe/client/expert.py (+165, -37)
  14. hivemind/moe/client/moe.py (+28, -24)
  15. hivemind/moe/client/remote_expert_worker.py (+48, -0)
  16. hivemind/moe/client/switch_moe.py (+2, -2)
  17. hivemind/moe/server/connection_handler.py (+102, -48)
  18. hivemind/moe/server/dht_handler.py (+22, -20)
  19. hivemind/moe/server/expert_uid.py (+2, -2)
  20. hivemind/moe/server/server.py (+26, -40)
  21. hivemind/p2p/p2p_daemon.py (+13, -6)
  22. hivemind/p2p/p2p_daemon_bindings/control.py (+6, -4)
  23. hivemind/p2p/p2p_daemon_bindings/datastructures.py (+7, -1)
  24. hivemind/p2p/p2p_daemon_bindings/p2pclient.py (+5, -4)
  25. hivemind/p2p/servicer.py (+4, -2)
  26. hivemind/proto/p2pd.proto (+2, -0)
  27. hivemind/utils/__init__.py (+1, -1)
  28. hivemind/utils/asyncio.py (+7, -1)
  29. hivemind/utils/grpc.py (+0, -210)
  30. hivemind/utils/networking.py (+2, -24)
  31. hivemind/utils/streaming.py (+49, -0)
  32. setup.py (+2, -2)
  33. tests/test_compression.py (+3, -2)
  34. tests/test_connection_handler.py (+192, -0)
  35. tests/test_custom_experts.py (+20, -9)
  36. tests/test_dht_experts.py (+28, -23)
  37. tests/test_moe.py (+40, -28)
  38. tests/test_p2p_daemon_bindings.py (+8, -2)
  39. tests/test_training.py (+17, -11)
  40. tests/test_util_modules.py (+1, -47)

+ 2 - 2
.github/workflows/run-tests.yml

@@ -12,7 +12,7 @@ jobs:
    strategy:
      matrix:
        python-version: [ 3.7, 3.8, 3.9 ]
-    timeout-minutes: 12
+    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
@@ -71,7 +71,7 @@ jobs:
  codecov_in_develop_mode:

    runs-on: ubuntu-latest
-    timeout-minutes: 12
+    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python

+ 35 - 17
benchmarks/benchmark_throughput.py

@@ -6,12 +6,14 @@ import time

import torch

-from hivemind.moe.client import RemoteExpert
+from hivemind.dht import DHT
+from hivemind.moe.client.expert import RemoteExpert, RemoteExpertInfo
+from hivemind.moe.client.remote_expert_worker import RemoteExpertWorker
from hivemind.moe.server import ExpertBackend, Server
from hivemind.moe.server.layers import name_to_block
+from hivemind.p2p import P2P, PeerInfo
from hivemind.utils.limits import increase_file_limit
from hivemind.utils.logging import get_logger, use_hivemind_log_handler
-from hivemind.utils.networking import LOCALHOST, get_free_port
from hivemind.utils.tensor_descr import BatchTensorDescriptor

use_hivemind_log_handler("in_root_logger")
@@ -31,14 +33,30 @@ def print_device_info(device=None):
        logger.info(f"Cached:   {round(torch.cuda.memory_cached(0) / 1024 ** 3, 1)} GB")


-def client_process(can_start, benchmarking_failed, port, num_experts, batch_size, hid_dim, num_batches, backprop=True):
+def client_process(
+    can_start,
+    benchmarking_failed,
+    server_maddrs,
+    server_peer_id,
+    num_experts,
+    batch_size,
+    hid_dim,
+    num_batches,
+    backprop=True,
+) -> None:
    torch.set_num_threads(1)
    can_start.wait()
-    experts = [RemoteExpert(f"expert{i}", endpoint=f"{LOCALHOST}:{port}") for i in range(num_experts)]
+
+    p2p = RemoteExpertWorker.run_coroutine(P2P.create(initial_peers=server_maddrs))
+    peer_info = PeerInfo(server_peer_id, server_maddrs)
+    experts = [
+        RemoteExpert(expert_info=RemoteExpertInfo(uid=f"expert.{i}", peer_info=peer_info), p2p=p2p)
+        for i in range(num_experts)
+    ]

    try:
        dummy_batch = torch.randn(batch_size, hid_dim)
-        for batch_i in range(num_batches):
+        for _ in range(num_batches):
            expert = random.choice(experts)
            out = expert(dummy_batch)
            if backprop:
@@ -59,7 +77,6 @@ def benchmark_throughput(
    max_batch_size=None,
    backprop=True,
    device=None,
-    port=None,
):
    assert (
        not hasattr(torch.cuda, "is_initialized")
@@ -67,7 +84,6 @@ def benchmark_throughput(
        or torch.device(device) == torch.device("cpu")
    )
    assert expert_cls in name_to_block
-    port = port or get_free_port()
    max_batch_size = max_batch_size or batch_size * 4
    num_handlers = max(1, num_handlers or num_clients // 2)
    benchmarking_failed = mp.Event()
@@ -75,8 +91,7 @@ def benchmark_throughput(
    timestamps = dict(started=time.perf_counter())

    try:
-        # start clients and await server
-        # Note: client processes must be launched BEFORE touching gpu, even torch.cuda.is_available can cause trouble
+        server_dht = DHT(start=True)
        clients = [
            mp.Process(
                target=client_process,
@@ -84,30 +99,30 @@ def benchmark_throughput(
                args=(
                    can_start,
                    benchmarking_failed,
-                    port,
+                    server_dht.get_visible_maddrs(),
+                    server_dht.peer_id,
                    num_experts,
                    batch_size,
                    hid_dim,
                    num_batches_per_client,
                    backprop,
                ),
+                daemon=True,
            )
            for i in range(num_clients)
        ]

        for client in clients:
-            client.daemon = True
            client.start()

        timestamps["launched_clients"] = timestamps["began_launching_server"] = time.perf_counter()

-        # start server
        device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        experts = {}
        for i in range(num_experts):
            expert = torch.jit.script(name_to_block[expert_cls](hid_dim))
-            experts[f"expert{i}"] = ExpertBackend(
-                name=f"expert{i}",
+            experts[f"expert.{i}"] = ExpertBackend(
+                name=f"expert.{i}",
                expert=expert,
                optimizer=torch.optim.Adam(expert.parameters()),
                args_schema=(BatchTensorDescriptor(hid_dim),),
@@ -115,21 +130,24 @@ def benchmark_throughput(
                max_batch_size=max_batch_size,
            )
        timestamps["created_experts"] = time.perf_counter()
+
        server = Server(
-            None,
-            experts,
-            listen_on=f"{LOCALHOST}:{port}",
+            dht=server_dht,
+            expert_backends=experts,
            num_connection_handlers=num_handlers,
            device=device,
        )
        server.start()
        server.ready.wait()
+
        timestamps["server_ready"] = time.perf_counter()
        can_start.set()

        for client in clients:
            client.join()
+
        timestamps["clients_finished"] = time.perf_counter()
+
    except BaseException as e:
        benchmarking_failed.set()
        raise e

+ 9 - 12
docs/user/moe.md

@@ -1,7 +1,7 @@
# Mixture-of-Experts

This tutorial covers the basics of Decentralized Mixture-of-Experts (DMoE).
-From the infrastructure standpoint, DMoE consists of two parts: experts hosted on peer devices, and a gating/routing function that assigns input to one of these experts.
+From the infrastructure standpoint, DMoE consists of two parts: experts hosted on peer devices, and client-side modules to access those experts.

## Host experts with a server

@@ -11,9 +11,8 @@ most of the model parameters and computation. The server can be started using ei
for now. To host a server with default experts, run this in your shell:

```sh
-hivemind-server --expert_cls ffn --hidden_dim 512 --num_experts 5 --expert_pattern "expert.[0:5]" \
-                --listen_on 0.0.0.0:1337
-# note: if you omit listen_on and/or dht_port, they will be chosen automatically and printed to stdout.
+hivemind-server --expert_cls ffn --hidden_dim 512 --num_experts 5 --expert_pattern "expert.[0:5]"
+# note: server will listen to a random port. To specify interface & port, add --host_maddrs and --announce_maddrs
```

<details style="margin-top:-24px; margin-bottom: 16px;">
@@ -22,8 +21,7 @@ hivemind-server --expert_cls ffn --hidden_dim 512 --num_experts 5 --expert_patte
```sh
[2021/07/15 18:52:01.424][INFO][moe.server.create:156] Running DHT node on ['/ip4/127.0.0.1/tcp/42513/p2p/QmacLgRkAHSqdWYdQ8TePioMxQCNV2JeD3AUDmbVd69gNL'], initial peers = []
[2021/07/15 18:52:01.424][INFO][moe.server.create:181] Generating 5 expert uids from pattern expert.[0:5]
-[2021/07/15 18:52:01.658][INFO][moe.server.run:233] Server started at 0.0.0.0:1337
-[2021/07/15 18:52:01.658][INFO][moe.server.run:234] Got 5 experts:
+[2021/07/15 18:52:01.658][INFO][moe.server.run:233] Server started with 5 experts
[2021/07/15 18:52:01.658][INFO][moe.server.run:237] expert.4: FeedforwardBlock, 2100736 parameters
[2021/07/15 18:52:01.658][INFO][moe.server.run:237] expert.0: FeedforwardBlock, 2100736 parameters
[2021/07/15 18:52:01.659][INFO][moe.server.run:237] expert.3: FeedforwardBlock, 2100736 parameters
@@ -67,8 +65,7 @@ hivemind-server --expert_cls ffn --hidden_dim 512 --num_experts 10 --expert_patt
```sh
[2021/07/15 18:53:41.700][INFO][moe.server.create:156] Running DHT node on ['/ip4/127.0.0.1/tcp/34487/p2p/QmcJ3jgbdwphLAiwGjvwrjimJJrdMyhLHf6tFj9viCFFGn'], initial peers = ['/ip4/127.0.0.1/tcp/42513/p2p/QmacLgRkAHSqdWYdQ8TePioMxQCNV2JeD3AUDmbVd69gNL']
[2021/07/15 18:53:41.700][INFO][moe.server.create:181] Generating 10 expert uids from pattern expert.[5:250]
-[2021/07/15 18:53:42.085][INFO][moe.server.run:233] Server started at 0.0.0.0:36389
-[2021/07/15 18:53:42.086][INFO][moe.server.run:234] Got 10 experts:
+[2021/07/15 18:53:42.085][INFO][moe.server.run:233] Server started with 10 experts:
[2021/07/15 18:53:42.086][INFO][moe.server.run:237] expert.55: FeedforwardBlock, 2100736 parameters
[2021/07/15 18:53:42.086][INFO][moe.server.run:237] expert.173: FeedforwardBlock, 2100736 parameters
[2021/07/15 18:53:42.086][INFO][moe.server.run:237] expert.164: FeedforwardBlock, 2100736 parameters
@@ -104,10 +101,10 @@ hivemind-server --expert_cls ffn --hidden_dim 512 --num_experts 10 --expert_patt

</details>

-By default, the server will only accept connections from your local machine. To access it globally, you should replace
-`127.0.0.1` part from initial peers with server's IP address. Hivemind supports both ipv4 and ipv6 protocols and uses the same notation
-as [libp2p](https://docs.libp2p.io/concepts/addressing/). You can find more details on multiaddresses in the
-[DHT tutorial](https://learning-at-home.readthedocs.io/en/latest/user/dht.html).
+By default, the server will only accept connections from your local network.
+To enable training over the Internet (or some other network), you should set `--host_maddrs` and `--announce_maddrs`.
+These options also allow you to select IPv4/IPv6 network protocols and TCP and QUIC transport protocols.
+You can find more details in the [DHT tutorial](https://learning-at-home.readthedocs.io/en/latest/user/dht.html).

## Train the experts


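For illustration, a hedged example of the new server flags referenced in the updated docs above (the multiaddr values are placeholders; `--announce_maddrs` is only needed when the publicly reachable address differs from the interface you bind to):

```sh
# listen on all IPv4 interfaces on TCP port 1337 and advertise a public address to peers
hivemind-server --expert_cls ffn --hidden_dim 512 --num_experts 5 --expert_pattern "expert.[0:5]" \
                --host_maddrs /ip4/0.0.0.0/tcp/1337 \
                --announce_maddrs /ip4/203.0.113.7/tcp/1337
```
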
+ 2 - 1
hivemind/averaging/averager.py

@@ -37,8 +37,8 @@ from hivemind.utils.asyncio import (
    enter_asynchronously,
    switch_to_uvloop,
)
-from hivemind.utils.grpc import combine_from_streaming, split_for_streaming
from hivemind.utils.serializer import MSGPackSerializer, SerializerBase
+from hivemind.utils.streaming import combine_from_streaming, split_for_streaming
from hivemind.utils.timed_storage import DHTExpiration, ValueWithExpiration, get_dht_time

# flavour types
@@ -709,6 +709,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
                        stream = await stub.rpc_download_state(averaging_pb2.DownloadRequest())
                        current_tensor_parts, tensors = [], []

+                        # TODO merge this with hivemind.compression.deserialize_tensor_stream
                        async for message in aiter_with_timeout(stream, timeout=timeout):
                            if message.metadata:
                                metadata = self.serializer.loads(message.metadata)

+ 5 - 1
hivemind/compression/__init__.py

@@ -6,4 +6,8 @@ from hivemind.compression.adaptive import PerTensorCompression, RoleAdaptiveComp
from hivemind.compression.base import CompressionBase, CompressionInfo, NoCompression, TensorRole
from hivemind.compression.floating import Float16Compression, ScaledFloat16Compression
from hivemind.compression.quantization import Quantile8BitQuantization, Uniform8BitQuantization
-from hivemind.compression.serialization import deserialize_torch_tensor, serialize_torch_tensor
+from hivemind.compression.serialization import (
+    deserialize_tensor_stream,
+    deserialize_torch_tensor,
+    serialize_torch_tensor,
+)

+ 2 - 2
hivemind/compression/adaptive.py

@@ -3,8 +3,8 @@ from typing import Mapping, Sequence, Union

import torch

-import hivemind
from hivemind.compression.base import CompressionBase, CompressionInfo, Key, NoCompression, TensorRole
+from hivemind.compression.serialization import deserialize_torch_tensor
from hivemind.proto import runtime_pb2


@@ -20,7 +20,7 @@ class AdaptiveCompressionBase(CompressionBase, ABC):
        return self.choose_compression(info).compress(tensor, info=info, allow_inplace=allow_inplace)

    def extract(self, serialized_tensor: runtime_pb2.Tensor) -> torch.Tensor:
-        return hivemind.compression.deserialize_torch_tensor(serialized_tensor)
+        return deserialize_torch_tensor(serialized_tensor)


class SizeAdaptiveCompression(AdaptiveCompressionBase):

+ 1 - 1
hivemind/compression/base.py

@@ -80,7 +80,7 @@ class NoCompression(CompressionBase):
    compression_type = runtime_pb2.CompressionType.NONE

    def compress(self, tensor: torch.Tensor, info: CompressionInfo, allow_inplace: bool = False) -> runtime_pb2.Tensor:
-        array = tensor.numpy()
+        array = tensor.detach().numpy()
        return runtime_pb2.Tensor(
            compression=self.compression_type,
            buffer=array.tobytes(),

+ 25 - 1
hivemind/compression/serialization.py

@@ -1,4 +1,6 @@
-from typing import Dict, Optional
+from __future__ import annotations
+
+from typing import AsyncIterator, Dict, Iterable, List, Optional

import torch

@@ -6,6 +8,7 @@ from hivemind.compression.base import CompressionBase, CompressionInfo, NoCompre
from hivemind.compression.floating import Float16Compression, ScaledFloat16Compression
from hivemind.compression.quantization import Quantile8BitQuantization, Uniform8BitQuantization
from hivemind.proto import runtime_pb2
+from hivemind.utils.streaming import combine_from_streaming

BASE_COMPRESSION_TYPES: Dict[str, CompressionBase] = dict(
    NONE=NoCompression(),
@@ -41,3 +44,24 @@ def deserialize_torch_tensor(serialized_tensor: runtime_pb2.Tensor) -> torch.Ten
    """Restore a pytorch tensor from a protobuf message"""
    compression = BASE_COMPRESSION_TYPES[runtime_pb2.CompressionType.Name(serialized_tensor.compression)]
    return compression.extract(serialized_tensor).requires_grad_(serialized_tensor.requires_grad)
+
+
+async def deserialize_tensor_stream(
+    stream: AsyncIterator[Iterable[runtime_pb2.Tensor]],
+) -> List[torch.Tensor]:
+    """Async wrapper of combine_from_streaming that combines tensors from a stream of parts and deserializes them"""
+
+    tensors = []
+    tensor_parts = []
+
+    async for parts in stream:
+        for part in parts:
+            if part.dtype and tensor_parts:
+                tensors.append(deserialize_torch_tensor(combine_from_streaming(tensor_parts)))
+                tensor_parts = []
+
+            tensor_parts.append(part)
+    if tensor_parts:
+        tensors.append(deserialize_torch_tensor(combine_from_streaming(tensor_parts)))
+
+    return tensors

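For illustration, a hedged round-trip sketch of the streaming helpers used above: serialize a tensor, split it into message-sized chunks the way the new streaming RPCs do, then reassemble and deserialize it (the chunk size is an arbitrary illustrative value, not part of the diff):

```python
import torch

from hivemind.compression import deserialize_torch_tensor, serialize_torch_tensor
from hivemind.proto import runtime_pb2
from hivemind.utils.streaming import combine_from_streaming, split_for_streaming

tensor = torch.randn(2048, 2048)
serialized = serialize_torch_tensor(tensor, runtime_pb2.CompressionType.NONE)

# split into ~64 KiB protobuf parts (the real clients use DEFAULT_MAX_MSG_SIZE as the chunk size)
parts = list(split_for_streaming(serialized, 2 ** 16))
restored = deserialize_torch_tensor(combine_from_streaming(parts))
assert torch.allclose(tensor, restored)
```
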
+ 3 - 2
hivemind/dht/dht.py

@@ -55,6 +55,7 @@ class DHT(mp.Process):
        **kwargs,
    ):
        self._parent_pid = os.getpid()
+        self._origin_pid = os.getpid()
        super().__init__()

        if not (
@@ -309,8 +310,8 @@ class DHT(mp.Process):
        Get a replica of a P2P instance used in the DHT process internally.
        The replica uses the same P2P daemon as the DHT and only works while DHT is alive.
        """
-
-        if self._p2p_replica is None:
+        if self._p2p_replica is None or self._origin_pid != os.getpid():
+            self._origin_pid = os.getpid()
            daemon_listen_maddr = self.run_coroutine(DHT._get_p2p_daemon_listen_maddr)
            self._p2p_replica = await P2P.replicate(daemon_listen_maddr)
        return self._p2p_replica

+ 0 - 1
hivemind/hivemind_cli/config.yml

@@ -1,4 +1,3 @@
-listen_on: 0.0.0.0:*
num_experts: 16
expert_cls: ffn
hidden_dim: 1024

+ 6 - 3
hivemind/hivemind_cli/run_server.py

@@ -18,8 +18,7 @@ def main():
    # fmt:off
    parser = configargparse.ArgParser(default_config_files=["config.yml"])
    parser.add('-c', '--config', required=False, is_config_file=True, help='config file path')
-    parser.add_argument('--listen_on', type=str, default='0.0.0.0:*', required=False,
-                        help="'localhost' for local connections only, '0.0.0.0' for ipv4 '[::]' for ipv6")
+
    parser.add_argument('--num_experts', type=int, default=None, required=False, help="The number of experts to serve")
    parser.add_argument('--expert_pattern', type=str, default=None, required=False,
                        help='all expert uids will follow this pattern, e.g. "myexpert.[0:256].[0:1024]" will'
@@ -32,6 +31,11 @@ def main():
                        help="expert type from test_utils.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop'")
    parser.add_argument('--hidden_dim', type=int, default=1024, required=False, help='main dimension for expert_cls')

+    parser.add_argument('--host_maddrs', type=list, nargs='+', default=['/ip4/0.0.0.0/tcp/0'], required=False,
+                        help='Multiaddrs to listen for external connections from other p2p instances; default: all IPv4 and TCP: /ip4/0.0.0.0/tcp/0')
+    parser.add_argument('--announce_maddrs', type=list, nargs='+', default=None, required=False,
+                        help='Visible multiaddrs the host announces for external connections from other p2p instances')
+
    parser.add_argument('--num_handlers', type=int, default=None, required=False,
                        help='server will use this many processes to handle incoming requests')
    parser.add_argument('--min_batch_size', type=int, default=1,
@@ -49,7 +53,6 @@ def main():
    parser.add_argument('--num_total_steps', type=int, required=False, help='The total number of steps for LR schedule')
    parser.add_argument('--clip_grad_norm', type=float, required=False, help='Maximum gradient norm used for clipping')

-    parser.add_argument('--no_dht', action='store_true', help='if specified, the server will not be attached to a dht')
    parser.add_argument('--initial_peers', type=str, nargs='*', required=False, default=[],
                        help='multiaddrs of one or more active DHT peers (if you want to join an existing DHT)')
    parser.add_argument('--increase_file_limit', action='store_true',

+ 23 - 10
hivemind/moe/client/beam_search.py

@@ -5,7 +5,12 @@ from functools import partial
from typing import Deque, Dict, Iterator, List, Optional, Sequence, Set, Tuple, Union

from hivemind.dht import DHT, DHTExpiration, DHTNode
-from hivemind.moe.client.expert import RemoteExpert
+from hivemind.moe.client.expert import (
+    RemoteExpert,
+    RemoteExpertInfo,
+    batch_create_remote_experts,
+    create_remote_experts,
+)
from hivemind.moe.server.expert_uid import (
    FLAT_EXPERT,
    PREFIX_PATTERN,
@@ -17,6 +22,7 @@ from hivemind.moe.server.expert_uid import (
    UidEndpoint,
    is_valid_prefix,
)
+from hivemind.p2p import PeerInfo
from hivemind.utils import MPFuture, get_dht_time, get_logger

logger = get_logger(__name__)
@@ -145,7 +151,7 @@ class MoEBeamSearcher:
                maybe_prefix_data = await pending_task
                if maybe_prefix_data is not None and isinstance(maybe_prefix_data.value, dict):
                    successors = {
-                        coord: UidEndpoint(*match.value)
+                        coord: UidEndpoint(uid=match.value[0], peer_info=PeerInfo.from_tuple(match.value[1]))
                        for coord, match in maybe_prefix_data.value.items()
                        if isinstance(coord, Coordinate)
                        and isinstance(getattr(match, "value", None), list)
@@ -212,7 +218,7 @@ class MoEBeamSearcher:
        for prefix, found in dht_responses.items():
            if found and isinstance(found.value, dict):
                successors[prefix] = {
-                    coord: UidEndpoint(*match.value)
+                    coord: UidEndpoint(uid=match.value[0], peer_info=PeerInfo.from_tuple(match.value[1]))
                    for coord, match in found.value.items()
                    if isinstance(coord, Coordinate)
                    and 0 <= coord < grid_size
@@ -230,7 +236,7 @@ class MoEBeamSearcher:

    def find_best_experts(
        self, grid_scores: Sequence[Sequence[float]], beam_size: int, return_future: bool = False
-    ) -> Union[List[RemoteExpert], MPFuture[RemoteExpert]]:
+    ) -> Union[List[RemoteExpert], MPFuture[List[RemoteExpert]]]:
        """
        Find and return :beam_size: active experts with highest scores, use both local cache and DHT

@@ -245,7 +251,7 @@ class MoEBeamSearcher:
        :returns: a list that contains *up to* k_best RemoteExpert instances
        """
        assert len(grid_scores) == len(self.grid_size) and beam_size > 0
-        return self.dht.run_coroutine(
+        result = self.dht.run_coroutine(
            partial(
                self._find_best_experts,
                prefix=self.uid_prefix,
@@ -258,6 +264,8 @@ class MoEBeamSearcher:
            return_future,
        )

+        return create_remote_experts(result, self.dht, return_future)
+
    @classmethod
    async def _find_best_experts(
        cls,
@@ -269,7 +277,7 @@ class MoEBeamSearcher:
        negative_caching: bool,
        cache_expiration: DHTExpiration,
        num_workers: Optional[int] = None,
-    ) -> List[RemoteExpert]:
+    ) -> List[RemoteExpertInfo]:
        num_workers = num_workers or min(beam_size, dht.num_workers or beam_size)

        # form initial beam from top-k active L1 prefixes, each row is (score, uid prefix, possible suffixes)
@@ -322,7 +330,10 @@ class MoEBeamSearcher:
                push_and_maybe_pop(best_experts_heap, (score, uid_endpoint))
                unique_experts.add(uid_endpoint.uid)

-        best_experts = [RemoteExpert(*uid_endpoint) for score, uid_endpoint in sorted(best_experts_heap, reverse=True)]
+        best_experts = [
+            RemoteExpertInfo(uid_endpoint.uid, uid_endpoint.peer_info)
+            for _, uid_endpoint in sorted(best_experts_heap, reverse=True)
+        ]
        return best_experts

    @staticmethod
@@ -351,7 +362,7 @@ class MoEBeamSearcher:

    def batch_find_best_experts(
        self, batch_grid_scores: Sequence[Sequence[Sequence[float]]], beam_size: int, return_future: bool = False
-    ) -> Union[List[List[RemoteExpert]], MPFuture]:
+    ) -> Union[List[List[RemoteExpert]], MPFuture[List[List[RemoteExpert]]]]:
        """
        Find and return :beam_size: active experts with highest scores, use both local cache and DHT

@@ -364,7 +375,7 @@ class MoEBeamSearcher:
        :param return_future: if set to True, returns MPFuture that can be awaited to get the actual result
        :returns: a list that contains *up to* k_best RemoteExpert instances
        """
-        return self.dht.run_coroutine(
+        result = self.dht.run_coroutine(
            partial(
                self._batch_find_best_experts,
                prefix=self.uid_prefix,
@@ -376,6 +387,8 @@ class MoEBeamSearcher:
            return_future,
        )

+        return batch_create_remote_experts(result, self.dht, return_future)
+
    @classmethod
    async def _batch_find_best_experts(
        cls,
@@ -386,7 +399,7 @@ class MoEBeamSearcher:
        beam_size: int,
        negative_caching: bool,
        num_workers: Optional[int],
-    ) -> Sequence[Sequence[RemoteExpert]]:
+    ) -> Sequence[Sequence[RemoteExpertInfo]]:
        batch_grid_scores = [
            [tuple(grid_score[i]) for grid_score in batch_grid_scores] for i in range(len(batch_grid_scores[0]))
        ]

+ 165 - 37
hivemind/moe/client/expert.py

@@ -1,43 +1,68 @@
-from typing import Any, Dict, Optional, Tuple
+from __future__ import annotations
+
+from concurrent.futures import Future
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union

import torch
import torch.nn as nn
from torch.autograd.function import once_differentiable

-from hivemind.compression import deserialize_torch_tensor, serialize_torch_tensor
-from hivemind.proto import runtime_pb2, runtime_pb2_grpc as runtime_grpc
-from hivemind.utils import Endpoint, MSGPackSerializer, nested_compare, nested_flatten, nested_pack
-from hivemind.utils.grpc import ChannelCache
+from hivemind import moe
+from hivemind.compression import deserialize_tensor_stream, deserialize_torch_tensor, serialize_torch_tensor
+from hivemind.dht import DHT
+from hivemind.moe.client.remote_expert_worker import RemoteExpertWorker
+from hivemind.p2p import P2P, PeerInfo, StubBase
+from hivemind.p2p.p2p_daemon import DEFAULT_MAX_MSG_SIZE
+from hivemind.proto import runtime_pb2
+from hivemind.utils.asyncio import amap_in_executor, iter_as_aiter
+from hivemind.utils.mpfuture import MPFuture
+from hivemind.utils.nested import nested_compare, nested_flatten, nested_pack
+from hivemind.utils.serializer import MSGPackSerializer
+from hivemind.utils.streaming import split_for_streaming

DUMMY = torch.empty(0, requires_grad=True)  # dummy tensor that triggers autograd in RemoteExpert


-def _get_expert_stub(endpoint: Endpoint, *extra_options: Tuple[str, Any]):
-    """Create a gRPC stub to access remote expert or use previously created stub from a process-wide cache"""
-    channel_options = (("grpc.max_send_message_length", -1), ("grpc.max_receive_message_length", -1)) + extra_options
-    return ChannelCache.get_stub(endpoint, runtime_grpc.ConnectionHandlerStub, aio=False, options=channel_options)
+def get_expert_stub(p2p: P2P, server_peer_info: PeerInfo) -> "ConnectionHandlerStub":
+    return moe.server.connection_handler.ConnectionHandler.get_stub(p2p, server_peer_info.peer_id)
+
+
+@dataclass(frozen=True)
+class RemoteExpertInfo:
+    """A simple data class containing uid of expert and server PeerInfo"""
+
+    uid: str
+    peer_info: PeerInfo


class RemoteExpert(nn.Module):
    """
    A simple module that runs forward/backward of an expert hosted on a remote machine.
    Works seamlessly with pytorch autograd. (this is essentially a simple RPC function)
-
    Warning: RemoteExpert currently assumes that you provide it with correct input shapes.
    Sending wrong input shapes can cause RemoteExpert to freeze indefinitely due to error in runtime.

-    :param uid: unique expert identifier
-    :param endpoint: network endpoint of a server that services that expert, e.g. "201.123.321.99:1337" or "[::]:8080"
+    :param expert_info: RemoteExpertInfo with uid and server PeerInfo
+    :param p2p: P2P instance connected to the running p2pd
    """

-    def __init__(self, uid, endpoint: Endpoint):
+    def __init__(self, expert_info: RemoteExpertInfo, p2p: P2P):
        super().__init__()
-        self.uid, self.endpoint = uid, endpoint
-        self._info = None
+        self._info, self.p2p = expert_info, p2p
+        self._rpc_info = None

    @property
-    def stub(self):
-        return _get_expert_stub(self.endpoint)
+    def uid(self):
+        return self._info.uid
+
+    @property
+    def server_peer_info(self):
+        return self._info.peer_info
+
+    @property
+    def stub(self) -> StubBase:
+        return get_expert_stub(self.p2p, self.server_peer_info)

    def forward(self, *args, **kwargs):
        """Call RemoteExpert for the specified inputs and return its output(s). Compatible with pytorch.autograd."""
@@ -52,18 +77,125 @@ class RemoteExpert(nn.Module):
            raise TypeError(f"Inputs do not match expert input schema. Did you pass the right number of parameters?")

        flat_outputs = _RemoteModuleCall.apply(DUMMY, self.uid, self.stub, self.info, *nested_flatten(forward_inputs))
+
        # Note: we send DUMMY to prevent torch from excluding expert from backward if no other inputs require grad
        return nested_pack(flat_outputs, structure=self.info["outputs_schema"])

    @property
    def info(self):
-        if self._info is None:
-            outputs = self.stub.info(runtime_pb2.ExpertUID(uid=self.uid))
-            self._info = MSGPackSerializer.loads(outputs.serialized_info)
-        return self._info
+        if self._rpc_info is None:
+            outputs = RemoteExpertWorker.run_coroutine(self.stub.rpc_info(runtime_pb2.ExpertUID(uid=self.uid)))
+            self._rpc_info = MSGPackSerializer.loads(outputs.serialized_info)
+        return self._rpc_info

    def extra_repr(self):
-        return f"uid={self.uid}, endpoint={self.endpoint}"
+        return f"uid={self.uid}, server_peer_info={self.server_peer_info}"
+
+
+def _create_remote_experts(infos: Sequence[Optional[RemoteExpertInfo]], p2p: P2P) -> List[Optional[RemoteExpert]]:
+    experts: List[Optional[RemoteExpert]] = []
+    for info in infos:
+        if info is not None:
+            experts.append(RemoteExpert(info, p2p))
+        else:
+            experts.append(None)
+    return experts
+
+
+def create_remote_experts(
+    infos: Union[Sequence[Optional[RemoteExpertInfo]], MPFuture], dht: DHT, return_future: bool = False
+) -> Union[List[Optional[RemoteExpert]], Future]:
+    if return_future:
+
+        async def _unpack(infos_future: MPFuture, dht: DHT):
+            p2p = await dht.replicate_p2p()
+            return _create_remote_experts(await infos_future, p2p)
+
+        return RemoteExpertWorker.run_coroutine(_unpack(infos, dht), return_future)
+
+    p2p = RemoteExpertWorker.run_coroutine(dht.replicate_p2p())
+    return _create_remote_experts(infos, p2p)
+
+
+def batch_create_remote_experts(
+    infos: Union[Sequence[Sequence[Optional[RemoteExpertInfo]]], MPFuture],
+    dht: DHT,
+    return_future: bool = False,
+) -> Union[List[List[Optional[RemoteExpert]]], Future]:
+    if return_future:
+
+        async def _unpack(infos_future: MPFuture, dht: DHT):
+            p2p = await dht.replicate_p2p()
+            return [_create_remote_experts(i, p2p) for i in await infos_future]
+
+        return RemoteExpertWorker.run_coroutine(_unpack(infos, dht), return_future)
+
+    return [create_remote_experts(exps, dht) for exps in infos]
+
+
+async def _backward_stream(uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub) -> List[torch.Tensor]:
+    split = (part for tensor in serialized_tensors for part in split_for_streaming(tensor, DEFAULT_MAX_MSG_SIZE))
+
+    grad_inputs = await stub.rpc_backward_stream(
+        amap_in_executor(
+            lambda tensor: runtime_pb2.ExpertRequest(uid=uid, tensors=[tensor]),
+            iter_as_aiter(split),
+        ),
+    )
+    tensors_stream = amap_in_executor(lambda msg: msg.tensors, grad_inputs)
+    return await deserialize_tensor_stream(tensors_stream)
+
+
+async def _backward_unary(uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub) -> List[torch.Tensor]:
+    grad_inputs: runtime_pb2.ExpertResponse = await stub.rpc_backward(
+        runtime_pb2.ExpertRequest(uid=uid, tensors=list(serialized_tensors))
+    )
+    return [deserialize_torch_tensor(t) for t in grad_inputs.tensors]
+
+
+async def expert_backward(
+    uid: str, inputs_and_grads: Sequence[torch.Tensor], serialized_tensors: Iterable[runtime_pb2.Tensor], stub
+) -> List[torch.Tensor]:
+    size = 0
+    for t in inputs_and_grads:
+        size += t.element_size() * t.nelement()
+        if size > DEFAULT_MAX_MSG_SIZE:
+            return await _backward_stream(uid, serialized_tensors, stub)
+    else:
+        return await _backward_unary(uid, serialized_tensors, stub)
+
+
+async def _forward_stream(uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub) -> List[torch.Tensor]:
+    split = (p for t in serialized_tensors for p in split_for_streaming(t, DEFAULT_MAX_MSG_SIZE))
+
+    outputs = await stub.rpc_forward_stream(
+        amap_in_executor(
+            lambda tensor: runtime_pb2.ExpertRequest(uid=uid, tensors=[tensor]),
+            iter_as_aiter(split),
+        ),
+    )
+
+    tensors_stream = amap_in_executor(lambda msg: msg.tensors, outputs)
+    return await deserialize_tensor_stream(tensors_stream)
+
+
+async def _forward_unary(uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub) -> List[torch.Tensor]:
+    outputs: runtime_pb2.ExpertResponse = await stub.rpc_forward(
+        runtime_pb2.ExpertRequest(uid=uid, tensors=list(serialized_tensors))
+    )
+    return [deserialize_torch_tensor(t) for t in outputs.tensors]
+
+
+async def expert_forward(
+    uid: str, inputs: Sequence[torch.Tensor], serialized_tensors: Iterable[runtime_pb2.Tensor], stub
+) -> List[torch.Tensor]:
+    size = 0
+    for t in inputs:
+        size += t.element_size() * t.nelement()
+        if size > DEFAULT_MAX_MSG_SIZE:
+            return await _forward_stream(uid, serialized_tensors, stub)
+    else:
+        return await _forward_unary(uid, serialized_tensors, stub)


class _RemoteModuleCall(torch.autograd.Function):
@@ -74,7 +206,7 @@ class _RemoteModuleCall(torch.autograd.Function):
        ctx,
        dummy: torch.Tensor,
        uid: str,
-        stub: runtime_grpc.ConnectionHandlerStub,
+        stub: "ConnectionHandlerStub",
        info: Dict[str, Any],
        *inputs: torch.Tensor,
    ) -> Tuple[torch.Tensor, ...]:
@@ -83,15 +215,11 @@ class _RemoteModuleCall(torch.autograd.Function):
        inputs = tuple(tensor.cpu().detach() for tensor in inputs)
        ctx.uid, ctx.stub, ctx.info = uid, stub, info
        ctx.save_for_backward(*inputs)
-
-        serialized_tensors = [
-            serialize_torch_tensor(inp, proto.compression)
-            for inp, proto in zip(inputs, nested_flatten(info["forward_schema"]))
-        ]
-
-        outputs = stub.forward(runtime_pb2.ExpertRequest(uid=ctx.uid, tensors=serialized_tensors))
-
-        deserialized_outputs = [deserialize_torch_tensor(tensor) for tensor in outputs.tensors]
+        serialized_tensors = (
+            serialize_torch_tensor(tensor, proto.compression)
+            for tensor, proto in zip(inputs, nested_flatten(info["forward_schema"]))
+        )
+        deserialized_outputs = RemoteExpertWorker.run_coroutine(expert_forward(uid, inputs, serialized_tensors, stub))

        return tuple(deserialized_outputs)

@@ -101,12 +229,12 @@ class _RemoteModuleCall(torch.autograd.Function):
        grad_outputs_cpu = tuple(tensor.cpu() for tensor in grad_outputs)
        inputs_and_grad_outputs = tuple(nested_flatten((ctx.saved_tensors, grad_outputs_cpu)))
        backward_schema = tuple(nested_flatten((ctx.info["forward_schema"], ctx.info["outputs_schema"])))
-        serialized_tensors = [
+        serialized_tensors = (
            serialize_torch_tensor(tensor, proto.compression)
            for tensor, proto in zip(inputs_and_grad_outputs, backward_schema)
-        ]
-
-        grad_inputs = ctx.stub.backward(runtime_pb2.ExpertRequest(uid=ctx.uid, tensors=serialized_tensors))
+        )
+        deserialized_grad_inputs = RemoteExpertWorker.run_coroutine(
+            expert_backward(ctx.uid, inputs_and_grad_outputs, serialized_tensors, ctx.stub)
+        )

-        deserialized_grad_inputs = [deserialize_torch_tensor(tensor) for tensor in grad_inputs.tensors]
        return (DUMMY, None, None, None, *deserialized_grad_inputs)

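For illustration, a hedged sketch of the size check that `expert_forward` / `expert_backward` above use to decide between the unary and streaming RPCs (the helper name `needs_streaming` is ours, not part of the diff):

```python
import torch

from hivemind.p2p.p2p_daemon import DEFAULT_MAX_MSG_SIZE


def needs_streaming(tensors) -> bool:
    """Return True if the serialized request is expected to exceed a single p2pd message."""
    total_bytes = sum(t.element_size() * t.nelement() for t in tensors)
    return total_bytes > DEFAULT_MAX_MSG_SIZE


print(needs_streaming([torch.randn(16, 1024)]))    # small request: unary rpc_forward
print(needs_streaming([torch.randn(8192, 8192)]))  # large request: rpc_forward_stream
```
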
+ 28 - 24
hivemind/moe/client/moe.py

@@ -1,20 +1,21 @@
from __future__ import annotations

import time
+from concurrent.futures import Future
from queue import Empty, Queue
from typing import Any, Dict, List, Optional, Tuple

-import grpc
import torch
import torch.nn as nn
from torch.autograd.function import once_differentiable

-from hivemind.compression import deserialize_torch_tensor, serialize_torch_tensor
+from hivemind.compression import serialize_torch_tensor
from hivemind.dht import DHT
from hivemind.moe.client.beam_search import MoEBeamSearcher
-from hivemind.moe.client.expert import DUMMY, RemoteExpert, _get_expert_stub
+from hivemind.moe.client.expert import DUMMY, RemoteExpert, expert_backward, expert_forward, get_expert_stub
+from hivemind.moe.client.remote_expert_worker import RemoteExpertWorker
from hivemind.moe.server.expert_uid import UID_DELIMITER
-from hivemind.proto import runtime_pb2, runtime_pb2_grpc as runtime_grpc
+from hivemind.p2p.p2p_daemon_bindings.control import P2PDaemonError
from hivemind.utils import nested_flatten, nested_map, nested_pack
from hivemind.utils.logging import get_logger

@@ -104,7 +105,7 @@ class RemoteMixtureOfExperts(nn.Module):
                    "No responding experts found during beam search. Check that UID prefixes and "
                    "the grid size are consistent with running Server instances."
                )
-            except grpc.RpcError as e:
+            except P2PDaemonError as e:
                logger.warning(f"Failed to get RemoteMixtureOfExperts.output_shape: {e}")

        expert_mask, *expert_outputs = _RemoteCallMany.apply(
@@ -178,7 +179,7 @@ class RemoteMixtureOfExperts(nn.Module):
            # grab some expert to set ensemble output shape
            proj_device = self.proj.weight.device
            dummy_scores_concat = self.proj(torch.randn(1, self.proj.in_features, device=proj_device))
-            dummy_scores = dummy_scores_concat.cpu().split_with_sizes(self.beam_search.grid_size, dim=-1)
+            dummy_scores = dummy_scores_concat.cpu().detach().split_with_sizes(self.beam_search.grid_size, dim=-1)
            dummy_experts = self.beam_search.find_best_experts(dummy_scores, beam_size=1)
            self._expert_info = dummy_experts[0].info
        return self._expert_info
@@ -223,15 +224,18 @@ class _RemoteCallMany(torch.autograd.Function):
        assert len(experts_per_sample) == len(flat_inputs_per_sample) == num_samples

        # dispatch tasks to all remote experts collect responses
-        pending_tasks: Dict[grpc.Future, Tuple[int, int]] = {}
+        pending_tasks: Dict[Future, Tuple[int, int]] = {}
        for i in range(num_samples):
            for j, expert in enumerate(experts_per_sample[i]):
-                input_tensors = [
+                stub = get_expert_stub(expert.p2p, expert.server_peer_info)
+                serialized_tensors = (
                    serialize_torch_tensor(tensor, proto.compression)
                    for tensor, proto in zip(flat_inputs_per_sample[i], nested_flatten(info["forward_schema"]))
-                ]
-                stub: runtime_grpc.ConnectionHandlerStub = _get_expert_stub(expert.endpoint)
-                new_task = stub.forward.future(runtime_pb2.ExpertRequest(uid=expert.uid, tensors=input_tensors))
+                )
+                new_task = RemoteExpertWorker.run_coroutine(
+                    expert_forward(expert.uid, flat_inputs_per_sample[i], serialized_tensors, stub),
+                    return_future=True,
+                )
                pending_tasks[new_task] = (i, j)

        responded_inds, alive_flat_outputs = cls._collect_responses(
@@ -316,14 +320,16 @@ class _RemoteCallMany(torch.autograd.Function):
        for i, j, inputs_ij, grad_outputs_ij in zip(
            alive_ii.cpu().numpy(), alive_jj.cpu().numpy(), inputs_per_expert, grad_outputs_per_expert
        ):
-            expert = expert_per_sample[i.item()][j.item()]
-            stub = _get_expert_stub(expert.endpoint)
+            expert: RemoteExpert = expert_per_sample[i.item()][j.item()]
+            stub = get_expert_stub(expert.p2p, expert.server_peer_info)
            inputs_and_grad_outputs = tuple(nested_flatten((inputs_ij, grad_outputs_ij)))
-            tensors_serialized = [
+            serialized_tensors = (
                serialize_torch_tensor(tensor, proto.compression)
                for tensor, proto in zip(inputs_and_grad_outputs, backward_schema)
-            ]
-            new_task = stub.backward.future(runtime_pb2.ExpertRequest(uid=expert.uid, tensors=tensors_serialized))
+            )
+            new_task = RemoteExpertWorker.run_coroutine(
+                expert_backward(expert.uid, inputs_and_grad_outputs, serialized_tensors, stub), return_future=True
+            )
            pending_tasks[new_task] = (i, j)

        survivor_inds, survivor_grad_inputs = cls._collect_responses(
@@ -358,7 +364,7 @@ class _RemoteCallMany(torch.autograd.Function):

    @staticmethod
    def _collect_responses(
-        task_to_indices: Dict[grpc.Future, Tuple[int, int]],
+        task_to_indices: Dict[Future, Tuple[int, int]],
        num_samples: int,
        k_min: int,
        timeout_total: Optional[float],
@@ -408,17 +414,15 @@ class _RemoteCallMany(torch.autograd.Function):
        return finished_indices, finished_outputs


-def _process_dispatched_task(task: grpc.Future, detect_anomalies: bool) -> Optional[Tuple[torch.Tensor]]:
+def _process_dispatched_task(task: Future, detect_anomalies: bool) -> Optional[Tuple[torch.Tensor]]:
    if task.exception() or task.cancelled():
        logger.warning(f"Task {task} failed: {type(task.exception())}")
        return None

-    deserialized_outputs = []
-    for tensor in task.result().tensors:
-        deserialized_tensor = deserialize_torch_tensor(tensor)
-        if detect_anomalies and not deserialized_tensor.isfinite().all():
+    outputs = task.result()
+    for tensor in outputs:
+        if detect_anomalies and not tensor.isfinite().all():
             logger.error(f"Task {task} failed: output tensor contains nan/inf values")
             logger.error(f"Task {task} failed: output tensor contains nan/inf values")
             return None
             return None
-        deserialized_outputs.append(deserialized_tensor)
 
 
-    return tuple(deserialized_outputs)
+    return outputs

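Example (not part of the diff): after this change, a forward pass through the mixture dispatches each sample's top-k experts over libp2p via futures returned by RemoteExpertWorker. A hedged usage sketch; the multiaddr, expert prefix, and grid size below are placeholders, and the constructor arguments are abbreviated and may differ from the full signature.

```python
import torch
import hivemind

# Placeholder multiaddr of an existing server's DHT node (e.g. copied from its startup logs).
initial_peers = ["/ip4/192.0.2.10/tcp/31337/p2p/QmServerPeerIdPlaceholder"]

dht = hivemind.DHT(initial_peers=initial_peers, start=True)
moe = hivemind.RemoteMixtureOfExperts(
    in_features=512, grid_size=(4, 4), dht=dht, uid_prefix="myprefix.", k_best=4
)
out = moe(torch.randn(2, 512))  # each sample is dispatched to up to k_best experts over libp2p
```
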
+ 48 - 0
hivemind/moe/client/remote_expert_worker.py

@@ -0,0 +1,48 @@
+import os
+from concurrent.futures import Future
+from queue import Queue
+from threading import Thread
+from typing import Awaitable, Optional
+
+from hivemind.utils import switch_to_uvloop
+
+
+class RemoteExpertWorker:
+    """Local thread for managing async tasks related to RemoteExpert"""
+
+    _task_queue: Queue = Queue()
+    _event_thread: Optional[Thread] = None
+    _pid: int = -1
+
+    @classmethod
+    def _run(cls):
+        loop = switch_to_uvloop()
+
+        async def receive_tasks():
+            while True:
+                cor, future = cls._task_queue.get()
+                try:
+                    result = await cor
+                except Exception as e:
+                    future.set_exception(e)
+                    continue
+                if not future.cancelled():
+                    future.set_result(result)
+
+        loop.run_until_complete(receive_tasks())
+
+    @classmethod
+    def run_coroutine(cls, coro: Awaitable, return_future: bool = False):
+        if cls._event_thread is None or cls._pid != os.getpid():
+            cls._pid = os.getpid()
+            cls._event_thread = Thread(target=cls._run, daemon=True)
+            cls._event_thread.start()
+
+        future = Future()
+        cls._task_queue.put((coro, future))
+
+        if return_future:
+            return future
+
+        result = future.result()
+        return result

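Example (not part of the diff): the worker above lets synchronous client code drive libp2p coroutines on a single background event loop. A small self-contained sketch with a stand-in coroutine instead of a real p2p call:

```python
import asyncio

from hivemind.moe.client.remote_expert_worker import RemoteExpertWorker


async def add(a: int, b: int) -> int:
    await asyncio.sleep(0.1)  # stand-in for an async p2p call such as expert_forward
    return a + b


assert RemoteExpertWorker.run_coroutine(add(2, 3)) == 5  # blocks until the coroutine finishes
future = RemoteExpertWorker.run_coroutine(add(4, 5), return_future=True)  # concurrent.futures.Future
assert future.result(timeout=5) == 9
```
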
+ 2 - 2
hivemind/moe/client/switch_moe.py

@@ -2,12 +2,12 @@ from __future__ import annotations
 
 
 from typing import List, Tuple
 from typing import List, Tuple
 
 
-import grpc
 import torch
 import torch
 
 
 from hivemind.moe.client.expert import DUMMY, RemoteExpert
 from hivemind.moe.client.expert import DUMMY, RemoteExpert
 from hivemind.moe.client.moe import RemoteMixtureOfExperts, _RemoteCallMany
 from hivemind.moe.client.moe import RemoteMixtureOfExperts, _RemoteCallMany
 from hivemind.moe.server.expert_uid import UID_DELIMITER
 from hivemind.moe.server.expert_uid import UID_DELIMITER
+from hivemind.p2p.p2p_daemon_bindings.control import P2PDaemonError
 from hivemind.utils import nested_flatten, nested_pack
 from hivemind.utils import nested_flatten, nested_pack
 from hivemind.utils.logging import get_logger
 from hivemind.utils.logging import get_logger
 
 
@@ -110,7 +110,7 @@ class RemoteSwitchMixtureOfExperts(RemoteMixtureOfExperts):
                     "No responding experts found during beam search. Check that UID prefixes and "
                     "No responding experts found during beam search. Check that UID prefixes and "
                     "the grid size are consistent with running Server instances."
                     "the grid size are consistent with running Server instances."
                 )
                 )
-            except grpc.RpcError as e:
+            except P2PDaemonError as e:
                 logger.warning(f"Failed to get RemoteSwitchMixtureOfExperts.output_shape: {e}")
                 logger.warning(f"Failed to get RemoteSwitchMixtureOfExperts.output_shape: {e}")
 
 
         expert_mask, *expert_outputs = _RemoteCallMany.apply(
         expert_mask, *expert_outputs = _RemoteCallMany.apply(

+ 102 - 48
hivemind/moe/server/connection_handler.py

@@ -1,82 +1,136 @@
+import asyncio
 import multiprocessing as mp
 import multiprocessing as mp
-import os
-from typing import Dict
+from typing import AsyncIterator, Dict, Iterable, List, Optional, Tuple, Union
 
 
-import grpc
 import torch
 import torch
 
 
-from hivemind.compression import deserialize_torch_tensor, serialize_torch_tensor
+from hivemind.compression import deserialize_tensor_stream, deserialize_torch_tensor, serialize_torch_tensor
+from hivemind.dht import DHT
 from hivemind.moe.server.expert_backend import ExpertBackend
 from hivemind.moe.server.expert_backend import ExpertBackend
-from hivemind.proto import runtime_pb2, runtime_pb2_grpc as runtime_grpc
-from hivemind.utils import Endpoint, MSGPackSerializer, get_logger, nested_flatten
-from hivemind.utils.asyncio import switch_to_uvloop
-from hivemind.utils.grpc import GRPC_KEEPALIVE_OPTIONS
+from hivemind.moe.server.task_pool import TaskPool
+from hivemind.p2p import P2PContext, ServicerBase
+from hivemind.p2p.p2p_daemon import DEFAULT_MAX_MSG_SIZE, P2P
+from hivemind.proto import runtime_pb2
+from hivemind.utils import MPFuture, MSGPackSerializer, as_aiter, get_logger, nested_flatten
+from hivemind.utils.asyncio import amap_in_executor, switch_to_uvloop
+from hivemind.utils.streaming import split_for_streaming
+from hivemind.utils.tensor_descr import BatchTensorDescriptor
 
 
 logger = get_logger(__name__)
 logger = get_logger(__name__)
 
 
 
 
-class ConnectionHandler(mp.context.ForkProcess):
+class ConnectionHandler(mp.context.ForkProcess, ServicerBase):
     """
     """
     A process that accepts incoming requests to experts and submits them into the corresponding TaskPool.
     A process that accepts incoming requests to experts and submits them into the corresponding TaskPool.
 
 
-    :note: ConnectionHandler is designed so as to allow using multiple handler processes for the same port.
-    :param listen_on: network interface, e.g. "0.0.0.0:1337" or "localhost:*" (* means pick any port) or "[::]:7654"
+    :note: ConnectionHandler is designed so as to allow using multiple handler processes for the same port
+    :param dht: a running hivemind.dht.DHT, used to let other peers connect to this one
     :param experts: a dict [UID -> ExpertBackend] with all active experts
     :param experts: a dict [UID -> ExpertBackend] with all active experts
     """
     """
 
 
-    def __init__(self, listen_on: Endpoint, experts: Dict[str, ExpertBackend]):
+    def __init__(self, dht: DHT, experts: Dict[str, ExpertBackend]):
         super().__init__()
         super().__init__()
-        self.listen_on, self.experts = listen_on, experts
-        self.ready = mp.Event()
+        self.dht, self.experts = dht, experts
+        self._p2p: Optional[P2P] = None
+
+        self.ready = MPFuture()
 
 
     def run(self):
     def run(self):
         torch.set_num_threads(1)
         torch.set_num_threads(1)
         loop = switch_to_uvloop()
         loop = switch_to_uvloop()
 
 
         async def _run():
         async def _run():
-            grpc.aio.init_grpc_aio()
-            logger.debug(f"Starting, pid {os.getpid()}")
-            server = grpc.aio.server(
-                options=GRPC_KEEPALIVE_OPTIONS
-                + (
-                    ("grpc.so_reuseport", 1),
-                    ("grpc.max_send_message_length", -1),
-                    ("grpc.max_receive_message_length", -1),
-                )
-            )
-            runtime_grpc.add_ConnectionHandlerServicer_to_server(self, server)
-
-            found_port = server.add_insecure_port(self.listen_on)
-            assert found_port != 0, f"Failed to listen to {self.listen_on}"
-
-            await server.start()
-            self.ready.set()
-            await server.wait_for_termination()
-            logger.debug(f"ConnectionHandler terminated: (pid={os.getpid()})")
+            try:
+                self._p2p = await self.dht.replicate_p2p()
+                await self.add_p2p_handlers(self._p2p, balanced=True)
+
+                # wait forever
+                await asyncio.Future()
+
+            except Exception as e:
+                self.ready.set_exception(e)
+                return
+
+        self.ready.set_result(None)
 
 
         try:
         try:
             loop.run_until_complete(_run())
             loop.run_until_complete(_run())
         except KeyboardInterrupt:
         except KeyboardInterrupt:
             logger.debug("Caught KeyboardInterrupt, shutting down")
             logger.debug("Caught KeyboardInterrupt, shutting down")
 
 
-    async def info(self, request: runtime_pb2.ExpertUID, context: grpc.ServicerContext):
+    async def rpc_info(self, request: runtime_pb2.ExpertUID, context: P2PContext) -> runtime_pb2.ExpertInfo:
         return runtime_pb2.ExpertInfo(serialized_info=MSGPackSerializer.dumps(self.experts[request.uid].get_info()))
         return runtime_pb2.ExpertInfo(serialized_info=MSGPackSerializer.dumps(self.experts[request.uid].get_info()))
 
 
-    async def forward(self, request: runtime_pb2.ExpertRequest, context: grpc.ServicerContext):
-        inputs = [deserialize_torch_tensor(tensor) for tensor in request.tensors]
-        future = self.experts[request.uid].forward_pool.submit_task(*inputs)
-        serialized_response = [
-            serialize_torch_tensor(tensor, proto.compression, allow_inplace=True)
-            for tensor, proto in zip(await future, nested_flatten(self.experts[request.uid].outputs_schema))
+    async def _gather_inputs(
+        self, requests: AsyncIterator[runtime_pb2.ExpertRequest], context: P2PContext
+    ) -> Tuple[str, List[torch.Tensor]]:
+        expert_uid = None
+
+        def _unpack(req: runtime_pb2.ExpertRequest) -> Iterable[runtime_pb2.Tensor]:
+            nonlocal expert_uid
+
+            if expert_uid is None:
+                expert_uid = req.uid
+            elif expert_uid != req.uid:
+                raise ValueError("Expert uids differ in one request")
+
+            return req.tensors
+
+        tensors_stream = amap_in_executor(_unpack, requests)
+        inputs = await deserialize_tensor_stream(tensors_stream)
+        return expert_uid, inputs
+
+    async def _process_inputs(
+        self,
+        inputs: List[torch.Tensor],
+        pool: TaskPool,
+        schema: Union[BatchTensorDescriptor, Tuple[BatchTensorDescriptor, ...]],
+    ) -> List[runtime_pb2.Tensor]:
+        return [
+            serialize_torch_tensor(result, proto.compression, allow_inplace=True)
+            for result, proto in zip(await pool.submit_task(*inputs), nested_flatten(schema))
         ]
         ]
 
 
-        return runtime_pb2.ExpertResponse(tensors=serialized_response)
+    async def rpc_forward(self, request: runtime_pb2.ExpertRequest, context: P2PContext) -> runtime_pb2.ExpertResponse:
+        inputs = [deserialize_torch_tensor(tensor) for tensor in request.tensors]
+        expert = self.experts[request.uid]
+        return runtime_pb2.ExpertResponse(
+            tensors=await self._process_inputs(inputs, expert.forward_pool, expert.outputs_schema)
+        )
+
+    async def rpc_forward_stream(
+        self, requests: AsyncIterator[runtime_pb2.ExpertRequest], context: P2PContext
+    ) -> AsyncIterator[runtime_pb2.ExpertResponse]:
+        uid, inputs = await self._gather_inputs(requests, context)
+        expert = self.experts[uid]
+        output_split = [
+            part
+            for tensor in await self._process_inputs(inputs, expert.forward_pool, expert.outputs_schema)
+            for part in split_for_streaming(tensor, DEFAULT_MAX_MSG_SIZE)
+        ]
 
 
-    async def backward(self, request: runtime_pb2.ExpertRequest, context: grpc.ServicerContext):
-        inputs_and_grad_outputs = [deserialize_torch_tensor(tensor) for tensor in request.tensors]
-        future = self.experts[request.uid].backward_pool.submit_task(*inputs_and_grad_outputs)
-        serialized_response = [
-            serialize_torch_tensor(tensor, proto.compression, allow_inplace=True)
-            for tensor, proto in zip(await future, nested_flatten(self.experts[request.uid].grad_inputs_schema))
+        async for part in as_aiter(*output_split):
+            yield runtime_pb2.ExpertResponse(tensors=[part])
+
+    async def rpc_backward(
+        self, request: runtime_pb2.ExpertRequest, context: P2PContext
+    ) -> runtime_pb2.ExpertResponse:
+        inputs_and_grads = [deserialize_torch_tensor(tensor) for tensor in request.tensors]
+        expert = self.experts[request.uid]
+        return runtime_pb2.ExpertResponse(
+            tensors=await self._process_inputs(inputs_and_grads, expert.backward_pool, expert.grad_inputs_schema)
+        )
+
+    async def rpc_backward_stream(
+        self, requests: AsyncIterator[runtime_pb2.ExpertRequest], context: P2PContext
+    ) -> AsyncIterator[runtime_pb2.ExpertResponse]:
+        uid, inputs_and_grads = await self._gather_inputs(requests, context)
+        expert = self.experts[uid]
+        output_split = [
+            part
+            for tensor in await self._process_inputs(inputs_and_grads, expert.backward_pool, expert.grad_inputs_schema)
+            for part in split_for_streaming(tensor, DEFAULT_MAX_MSG_SIZE)
         ]
         ]
-        return runtime_pb2.ExpertResponse(tensors=serialized_response)
+
+        async for part in as_aiter(*output_split):
+            yield runtime_pb2.ExpertResponse(tensors=[part])

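Example (not part of the diff): the streaming RPCs above chop each serialized tensor into parts no larger than DEFAULT_MAX_MSG_SIZE and reassemble them on the receiving side. A rough round-trip sketch; the helper signatures are inferred from their usage in this file:

```python
import torch

from hivemind.compression import deserialize_torch_tensor, serialize_torch_tensor
from hivemind.p2p.p2p_daemon import DEFAULT_MAX_MSG_SIZE
from hivemind.utils.streaming import combine_from_streaming, split_for_streaming

tensor = torch.randn(2048, 2048)  # large enough to require several chunks
serialized = serialize_torch_tensor(tensor)
parts = list(split_for_streaming(serialized, DEFAULT_MAX_MSG_SIZE))  # one protobuf part per chunk
restored = deserialize_torch_tensor(combine_from_streaming(parts))
assert torch.allclose(tensor, restored)
```
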
+ 22 - 20
hivemind/moe/server/dht_handler.py

@@ -1,9 +1,9 @@
 import threading
 import threading
 from functools import partial
 from functools import partial
-from typing import Dict, List, Optional, Sequence, Tuple
+from typing import Dict, List, Optional, Sequence, Tuple, Union
 
 
 from hivemind.dht import DHT, DHTExpiration, DHTNode, DHTValue
 from hivemind.dht import DHT, DHTExpiration, DHTNode, DHTValue
-from hivemind.moe.client.expert import RemoteExpert
+from hivemind.moe.client.expert import RemoteExpert, RemoteExpertInfo, create_remote_experts
 from hivemind.moe.server.expert_uid import (
 from hivemind.moe.server.expert_uid import (
     FLAT_EXPERT,
     FLAT_EXPERT,
     UID_DELIMITER,
     UID_DELIMITER,
@@ -14,33 +14,31 @@ from hivemind.moe.server.expert_uid import (
     is_valid_uid,
     is_valid_uid,
     split_uid,
     split_uid,
 )
 )
-from hivemind.utils import Endpoint, get_dht_time, get_port
+from hivemind.p2p import PeerID, PeerInfo
+from hivemind.utils import MPFuture, get_dht_time
 
 
 
 
 class DHTHandlerThread(threading.Thread):
 class DHTHandlerThread(threading.Thread):
-    def __init__(self, experts, dht: DHT, endpoint: Endpoint, update_period: int = 5, **kwargs):
+    def __init__(self, experts, dht: DHT, update_period: int = 5, **kwargs):
         super().__init__(**kwargs)
         super().__init__(**kwargs)
-        assert get_port(endpoint) is not None
-        self.endpoint = endpoint
         self.experts = experts
         self.experts = experts
         self.dht = dht
         self.dht = dht
         self.update_period = update_period
         self.update_period = update_period
         self.stop = threading.Event()
         self.stop = threading.Event()
 
 
     def run(self) -> None:
     def run(self) -> None:
-        declare_experts(self.dht, self.experts.keys(), self.endpoint)
+        declare_experts(self.dht, self.experts.keys())
         while not self.stop.wait(self.update_period):
         while not self.stop.wait(self.update_period):
-            declare_experts(self.dht, self.experts.keys(), self.endpoint)
+            declare_experts(self.dht, self.experts.keys())
 
 
 
 
 def declare_experts(
 def declare_experts(
-    dht: DHT, uids: Sequence[ExpertUID], endpoint: Endpoint, expiration: DHTExpiration = 300, wait: bool = True
-) -> Dict[ExpertUID, bool]:
+    dht: DHT, uids: Sequence[ExpertUID], expiration: DHTExpiration = 300, wait: bool = True
+) -> Union[Dict[ExpertUID, bool], MPFuture[Dict[ExpertUID, bool]]]:
     """
     """
     Make experts visible to all DHT peers; update timestamps if declared previously.
     Make experts visible to all DHT peers; update timestamps if declared previously.
 
 
     :param uids: a list of expert ids to update
     :param uids: a list of expert ids to update
-    :param endpoint: endpoint that serves these experts, usually your server endpoint (e.g. "201.111.222.333:1337")
     :param wait: if True, awaits for declaration to finish, otherwise runs in background
     :param wait: if True, awaits for declaration to finish, otherwise runs in background
     :param expiration: experts will be visible for this many seconds
     :param expiration: experts will be visible for this many seconds
     :returns: if wait, returns store status for every key (True = store succeeded, False = store rejected)
     :returns: if wait, returns store status for every key (True = store succeeded, False = store rejected)
@@ -48,23 +46,25 @@ def declare_experts(
     assert not isinstance(uids, str), "Please send a list / tuple of expert uids."
     assert not isinstance(uids, str), "Please send a list / tuple of expert uids."
     for uid in uids:
     for uid in uids:
         assert is_valid_uid(uid), f"{uid} is not a valid expert uid. All uids must follow {UID_PATTERN.pattern}"
         assert is_valid_uid(uid), f"{uid} is not a valid expert uid. All uids must follow {UID_PATTERN.pattern}"
+    addrs = tuple(str(a.decapsulate("/p2p/" + a.get("p2p"))) for a in dht.get_visible_maddrs())
     return dht.run_coroutine(
     return dht.run_coroutine(
-        partial(_declare_experts, uids=list(uids), endpoint=endpoint, expiration=expiration), return_future=not wait
+        partial(_declare_experts, uids=list(uids), peer_id=dht.peer_id, addrs=addrs, expiration=expiration),
+        return_future=not wait,
     )
     )
 
 
 
 
 async def _declare_experts(
 async def _declare_experts(
-    dht: DHT, node: DHTNode, uids: List[ExpertUID], endpoint: Endpoint, expiration: DHTExpiration
+    dht: DHT, node: DHTNode, uids: List[ExpertUID], peer_id: PeerID, addrs: Tuple[str], expiration: DHTExpiration
 ) -> Dict[ExpertUID, bool]:
 ) -> Dict[ExpertUID, bool]:
     num_workers = len(uids) if dht.num_workers is None else min(len(uids), dht.num_workers)
     num_workers = len(uids) if dht.num_workers is None else min(len(uids), dht.num_workers)
     expiration_time = get_dht_time() + expiration
     expiration_time = get_dht_time() + expiration
     data_to_store: Dict[Tuple[ExpertPrefix, Optional[Coordinate]], DHTValue] = {}
     data_to_store: Dict[Tuple[ExpertPrefix, Optional[Coordinate]], DHTValue] = {}
     for uid in uids:
     for uid in uids:
-        data_to_store[uid, None] = endpoint
+        data_to_store[uid, None] = (peer_id.to_base58(), addrs)
         prefix = uid if uid.count(UID_DELIMITER) > 1 else f"{uid}{UID_DELIMITER}{FLAT_EXPERT}"
         prefix = uid if uid.count(UID_DELIMITER) > 1 else f"{uid}{UID_DELIMITER}{FLAT_EXPERT}"
         for i in range(prefix.count(UID_DELIMITER) - 1):
         for i in range(prefix.count(UID_DELIMITER) - 1):
             prefix, last_coord = split_uid(prefix)
             prefix, last_coord = split_uid(prefix)
-            data_to_store[prefix, last_coord] = [uid, endpoint]
+            data_to_store[prefix, last_coord] = [uid, (peer_id.to_base58(), addrs)]
 
 
     keys, maybe_subkeys, values = zip(*((key, subkey, value) for (key, subkey), value in data_to_store.items()))
     keys, maybe_subkeys, values = zip(*((key, subkey, value) for (key, subkey), value in data_to_store.items()))
     store_ok = await node.store_many(keys, values, expiration_time, subkeys=maybe_subkeys, num_workers=num_workers)
     store_ok = await node.store_many(keys, values, expiration_time, subkeys=maybe_subkeys, num_workers=num_workers)
@@ -73,7 +73,7 @@ async def _declare_experts(
 
 
 def get_experts(
 def get_experts(
     dht: DHT, uids: List[ExpertUID], expiration_time: Optional[DHTExpiration] = None, return_future: bool = False
     dht: DHT, uids: List[ExpertUID], expiration_time: Optional[DHTExpiration] = None, return_future: bool = False
-) -> List[Optional[RemoteExpert]]:
+) -> Union[List[Optional[RemoteExpert]], MPFuture[List[Optional[RemoteExpert]]]]:
     """
     """
     :param uids: find experts with these ids from across the DHT
     :param uids: find experts with these ids from across the DHT
     :param expiration_time: if specified, return experts that expire no sooner than this (based on get_dht_time)
     :param expiration_time: if specified, return experts that expire no sooner than this (based on get_dht_time)
@@ -81,12 +81,13 @@ def get_experts(
     :returns: a list of [RemoteExpert if found else None]
     :returns: a list of [RemoteExpert if found else None]
     """
     """
     assert not isinstance(uids, str), "Please send a list / tuple of expert uids."
     assert not isinstance(uids, str), "Please send a list / tuple of expert uids."
-    return dht.run_coroutine(partial(_get_experts, uids=list(uids), expiration_time=expiration_time), return_future)
+    result = dht.run_coroutine(partial(_get_experts, uids=list(uids), expiration_time=expiration_time), return_future)
+    return create_remote_experts(result, dht, return_future)
 
 
 
 
 async def _get_experts(
 async def _get_experts(
     dht: DHT, node: DHTNode, uids: List[ExpertUID], expiration_time: Optional[DHTExpiration]
     dht: DHT, node: DHTNode, uids: List[ExpertUID], expiration_time: Optional[DHTExpiration]
-) -> List[Optional[RemoteExpert]]:
+) -> List[Optional[RemoteExpertInfo]]:
     if expiration_time is None:
     if expiration_time is None:
         expiration_time = get_dht_time()
         expiration_time = get_dht_time()
     num_workers = len(uids) if dht.num_workers is None else min(len(uids), dht.num_workers)
     num_workers = len(uids) if dht.num_workers is None else min(len(uids), dht.num_workers)
@@ -94,6 +95,7 @@ async def _get_experts(
 
 
     experts: List[Optional[RemoteExpert]] = [None] * len(uids)
     experts: List[Optional[RemoteExpert]] = [None] * len(uids)
     for i, uid in enumerate(uids):
     for i, uid in enumerate(uids):
-        if found[uid] is not None and isinstance(found[uid].value, Endpoint):
-            experts[i] = RemoteExpert(uid, found[uid].value)
+        expert_info_for_uid = found[uid]
+        if expert_info_for_uid is not None and isinstance(expert_info_for_uid.value, tuple):
+            experts[i] = RemoteExpertInfo(uid, PeerInfo.from_tuple(expert_info_for_uid.value))
     return experts
     return experts

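Example (not part of the diff): with the endpoint argument gone, a server process simply announces its expert uids and the DHT stores its (peer_id, multiaddrs) tuple. A sketch on a single standalone DHT node; the uids are made up:

```python
import hivemind
from hivemind.moe.server.dht_handler import declare_experts, get_experts

dht = hivemind.DHT(start=True)
declare_experts(dht, ["ffn_expert.0.0", "ffn_expert.0.1"], expiration=300)

found = get_experts(dht, ["ffn_expert.0.0", "ffn_expert.9.9"])
assert found[0] is not None and found[0].uid == "ffn_expert.0.0"
assert found[1] is None  # this uid was never declared
```
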
+ 2 - 2
hivemind/moe/server/expert_uid.py

@@ -1,10 +1,10 @@
 import re
 import re
 from typing import NamedTuple, Tuple, Union
 from typing import NamedTuple, Tuple, Union
 
 
-from hivemind.utils import Endpoint
+from hivemind.p2p.p2p_daemon_bindings.datastructures import PeerInfo
 
 
 ExpertUID, ExpertPrefix, Coordinate, Score = str, str, int, float
 ExpertUID, ExpertPrefix, Coordinate, Score = str, str, int, float
-UidEndpoint = NamedTuple("UidEndpoint", [("uid", ExpertUID), ("endpoint", Endpoint)])
+UidEndpoint = NamedTuple("UidEndpoint", [("uid", ExpertUID), ("peer_info", PeerInfo)])
 UID_DELIMITER = "."  # when declaring experts, DHT store all prefixes of that expert's uid, split over this prefix
 UID_DELIMITER = "."  # when declaring experts, DHT store all prefixes of that expert's uid, split over this prefix
 FLAT_EXPERT = -1  # grid prefix reserved for storing 1d expert uids. Used to speed up find_best_experts in 1d case.
 FLAT_EXPERT = -1  # grid prefix reserved for storing 1d expert uids. Used to speed up find_best_experts in 1d case.
 UID_PATTERN = re.compile("^(([^.])+)([.](?:[0]|([1-9]([0-9]*))))+$")  # e.g. ffn_expert.98.76.54 - prefix + some dims
 UID_PATTERN = re.compile("^(([^.])+)([.](?:[0]|([1-9]([0-9]*))))+$")  # e.g. ffn_expert.98.76.54 - prefix + some dims

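Example (not part of the diff): the uid grammar encoded by UID_PATTERN is a prefix followed by one or more dot-separated non-negative integers without leading zeros, e.g.:

```python
from hivemind.moe.server.expert_uid import UID_PATTERN

assert UID_PATTERN.match("ffn_expert.98.76.54")
assert UID_PATTERN.match("expert.0")
assert not UID_PATTERN.match("expert")     # at least one coordinate is required
assert not UID_PATTERN.match("expert.01")  # no leading zeros except a bare "0"
```
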
+ 26 - 40
hivemind/moe/server/server.py

@@ -24,9 +24,9 @@ from hivemind.moe.server.layers import (
     schedule_name_to_scheduler,
     schedule_name_to_scheduler,
 )
 )
 from hivemind.moe.server.runtime import Runtime
 from hivemind.moe.server.runtime import Runtime
+from hivemind.p2p import PeerInfo
 from hivemind.proto.runtime_pb2 import CompressionType
 from hivemind.proto.runtime_pb2 import CompressionType
 from hivemind.utils.logging import get_logger
 from hivemind.utils.logging import get_logger
-from hivemind.utils.networking import Endpoint, get_free_port, get_port, replace_port
 from hivemind.utils.tensor_descr import DUMMY_BATCH_SIZE, BatchTensorDescriptor
 from hivemind.utils.tensor_descr import DUMMY_BATCH_SIZE, BatchTensorDescriptor
 
 
 logger = get_logger(__name__)
 logger = get_logger(__name__)
@@ -41,10 +41,8 @@ class Server(threading.Thread):
      - processes incoming forward/backward requests via Runtime (created by the server)
      - processes incoming forward/backward requests via Runtime (created by the server)
      - publishes updates to expert status every :update_period: seconds
      - publishes updates to expert status every :update_period: seconds
 
 
-    :type dht: DHT or None. Server with dht=None will NOT be visible from DHT,
-     but it will still support accessing experts directly with RemoteExpert(uid=UID, endpoint="IPADDR:PORT").
+    :type dht: an instance of hivemind.DHT. Server will use DHT for all network interactions.
     :param expert_backends: dict{expert uid (str) : ExpertBackend} for all expert hosted by this server.
     :param expert_backends: dict{expert uid (str) : ExpertBackend} for all expert hosted by this server.
-    :param listen_on: server's dht address that determines how it can be accessed. Address and (optional) port
     :param num_connection_handlers: maximum number of simultaneous requests. Please note that the default value of 1
     :param num_connection_handlers: maximum number of simultaneous requests. Please note that the default value of 1
         if too small for normal functioning, we recommend 4 handlers per expert backend.
         if too small for normal functioning, we recommend 4 handlers per expert backend.
     :param update_period: how often will server attempt to publish its state (i.e. experts) to the DHT;
     :param update_period: how often will server attempt to publish its state (i.e. experts) to the DHT;
@@ -55,9 +53,8 @@ class Server(threading.Thread):
 
 
     def __init__(
     def __init__(
         self,
         self,
-        dht: Optional[DHT],
+        dht: DHT,
         expert_backends: Dict[str, ExpertBackend],
         expert_backends: Dict[str, ExpertBackend],
-        listen_on: Endpoint = "0.0.0.0:*",
         num_connection_handlers: int = 1,
         num_connection_handlers: int = 1,
         update_period: int = 30,
         update_period: int = 30,
         start=False,
         start=False,
@@ -66,22 +63,18 @@ class Server(threading.Thread):
     ):
     ):
         super().__init__()
         super().__init__()
         self.dht, self.experts, self.update_period = dht, expert_backends, update_period
         self.dht, self.experts, self.update_period = dht, expert_backends, update_period
-        if get_port(listen_on) is None:
-            listen_on = replace_port(listen_on, new_port=get_free_port())
-        self.listen_on, self.port = listen_on, get_port(listen_on)
 
 
-        self.conn_handlers = [ConnectionHandler(listen_on, self.experts) for _ in range(num_connection_handlers)]
+        self.conn_handlers = [ConnectionHandler(dht, self.experts) for _ in range(num_connection_handlers)]
         if checkpoint_dir is not None:
         if checkpoint_dir is not None:
             self.checkpoint_saver = CheckpointSaver(expert_backends, checkpoint_dir, update_period)
             self.checkpoint_saver = CheckpointSaver(expert_backends, checkpoint_dir, update_period)
         else:
         else:
             self.checkpoint_saver = None
             self.checkpoint_saver = None
         self.runtime = Runtime(self.experts, **kwargs)
         self.runtime = Runtime(self.experts, **kwargs)
 
 
-        if self.dht and self.experts:
+        if self.experts:
             self.dht_handler_thread = DHTHandlerThread(
             self.dht_handler_thread = DHTHandlerThread(
                 experts=self.experts,
                 experts=self.experts,
                 dht=self.dht,
                 dht=self.dht,
-                endpoint=self.listen_on,
                 update_period=self.update_period,
                 update_period=self.update_period,
                 daemon=True,
                 daemon=True,
             )
             )
@@ -92,7 +85,6 @@ class Server(threading.Thread):
     @classmethod
     @classmethod
     def create(
     def create(
         cls,
         cls,
-        listen_on="0.0.0.0:*",
         num_experts: int = None,
         num_experts: int = None,
         expert_uids: str = None,
         expert_uids: str = None,
         expert_pattern: str = None,
         expert_pattern: str = None,
@@ -107,7 +99,6 @@ class Server(threading.Thread):
         min_batch_size=1,
         min_batch_size=1,
         max_batch_size=4096,
         max_batch_size=4096,
         device=None,
         device=None,
-        no_dht=False,
         initial_peers=(),
         initial_peers=(),
         checkpoint_dir: Optional[Path] = None,
         checkpoint_dir: Optional[Path] = None,
         compression=CompressionType.NONE,
         compression=CompressionType.NONE,
@@ -115,10 +106,11 @@ class Server(threading.Thread):
         custom_module_path=None,
         custom_module_path=None,
         *,
         *,
         start: bool,
         start: bool,
+        **kwargs,
     ) -> Server:
     ) -> Server:
         """
         """
         Instantiate a server with several identical experts. See argparse comments below for details
         Instantiate a server with several identical experts. See argparse comments below for details
-        :param listen_on: network interface with address and (optional) port, e.g. "127.0.0.1:1337" or "[::]:80"
+
         :param num_experts: run this many identical experts
         :param num_experts: run this many identical experts
         :param expert_pattern: a string pattern or a list of expert uids,  example: myprefix.[0:32].[0:256]\
         :param expert_pattern: a string pattern or a list of expert uids,  example: myprefix.[0:32].[0:256]\
            means "sample random experts between myprefix.0.0 and myprefix.255.255;
            means "sample random experts between myprefix.0.0 and myprefix.255.255;
@@ -136,7 +128,6 @@ class Server(threading.Thread):
         :param num_total_steps: the total number of steps for LR schedule
         :param num_total_steps: the total number of steps for LR schedule
         :param clip_grad_norm: maximum gradient norm used for clipping
         :param clip_grad_norm: maximum gradient norm used for clipping
 
 
-        :param no_dht: if specified, the server will not be attached to a dht
         :param initial_peers: multiaddrs of one or more active DHT peers (if you want to join an existing DHT)
         :param initial_peers: multiaddrs of one or more active DHT peers (if you want to join an existing DHT)
 
 
         :param checkpoint_dir: directory to save and load expert checkpoints
         :param checkpoint_dir: directory to save and load expert checkpoints
@@ -147,17 +138,15 @@ class Server(threading.Thread):
 
 
         :param start: if True, starts server right away and returns when server is ready for requests
         :param start: if True, starts server right away and returns when server is ready for requests
         :param stats_report_interval: interval between two reports of batch processing performance statistics
         :param stats_report_interval: interval between two reports of batch processing performance statistics
+        :param kwargs: any other params will be forwarded to DHT upon creation
         """
         """
         if custom_module_path is not None:
         if custom_module_path is not None:
             add_custom_models_from_file(custom_module_path)
             add_custom_models_from_file(custom_module_path)
         assert expert_cls in name_to_block
         assert expert_cls in name_to_block
 
 
-        if no_dht:
-            dht = None
-        else:
-            dht = DHT(initial_peers=initial_peers, start=True)
-            visible_maddrs_str = [str(a) for a in dht.get_visible_maddrs()]
-            logger.info(f"Running DHT node on {visible_maddrs_str}, initial peers = {initial_peers}")
+        dht = DHT(initial_peers=initial_peers, start=True, **kwargs)
+        visible_maddrs_str = [str(a) for a in dht.get_visible_maddrs()]
+        logger.info(f"Running DHT node on {visible_maddrs_str}, initial peers = {initial_peers}")
 
 
         assert (expert_pattern is None and num_experts is None and expert_uids is not None) or (
         assert (expert_pattern is None and num_experts is None and expert_uids is not None) or (
             num_experts is not None and expert_uids is None
             num_experts is not None and expert_uids is None
@@ -221,7 +210,6 @@ class Server(threading.Thread):
         return cls(
         return cls(
             dht,
             dht,
             experts,
             experts,
-            listen_on=listen_on,
             num_connection_handlers=num_handlers,
             num_connection_handlers=num_handlers,
             device=device,
             device=device,
             checkpoint_dir=checkpoint_dir,
             checkpoint_dir=checkpoint_dir,
@@ -234,25 +222,24 @@ class Server(threading.Thread):
         Starts Server in the current thread. Initializes dht if necessary, starts connection handlers,
         Starts Server in the current thread. Initializes dht if necessary, starts connection handlers,
         runs Runtime (self.runtime) to process incoming requests.
         runs Runtime (self.runtime) to process incoming requests.
         """
         """
-        logger.info(f"Server started at {self.listen_on}")
-        logger.info(f"Got {len(self.experts)} experts:")
+        logger.info(f"Server started with {len(self.experts)} experts:")
         for expert_name, backend in self.experts.items():
         for expert_name, backend in self.experts.items():
             num_parameters = sum(p.numel() for p in backend.expert.parameters() if p.requires_grad)
             num_parameters = sum(p.numel() for p in backend.expert.parameters() if p.requires_grad)
             logger.info(f"{expert_name}: {backend.expert.__class__.__name__}, {num_parameters} parameters")
             logger.info(f"{expert_name}: {backend.expert.__class__.__name__}, {num_parameters} parameters")
 
 
-        if self.dht:
-            if not self.dht.is_alive():
-                self.dht.run_in_background(await_ready=True)
+        if not self.dht.is_alive():
+            self.dht.run_in_background(await_ready=True)
+
+        if self.experts:
+            self.dht_handler_thread.start()
 
 
-            if self.experts:
-                self.dht_handler_thread.start()
         if self.checkpoint_saver is not None:
         if self.checkpoint_saver is not None:
             self.checkpoint_saver.start()
             self.checkpoint_saver.start()
 
 
         for process in self.conn_handlers:
         for process in self.conn_handlers:
             if not process.is_alive():
             if not process.is_alive():
                 process.start()
                 process.start()
-            process.ready.wait()
+            process.ready.result()
 
 
         try:
         try:
             self.runtime.run()
             self.runtime.run()
@@ -294,7 +281,7 @@ class Server(threading.Thread):
             process.join()
             process.join()
         logger.debug("Connection handlers terminated")
         logger.debug("Connection handlers terminated")
 
 
-        if self.dht and self.experts:
+        if self.experts:
             self.dht_handler_thread.stop.set()
             self.dht_handler_thread.stop.set()
             self.dht_handler_thread.join()
             self.dht_handler_thread.join()
 
 
@@ -302,9 +289,8 @@ class Server(threading.Thread):
             self.checkpoint_saver.stop.set()
             self.checkpoint_saver.stop.set()
             self.checkpoint_saver.join()
             self.checkpoint_saver.join()
 
 
-        if self.dht is not None:
-            self.dht.shutdown()
-            self.dht.join()
+        self.dht.shutdown()
+        self.dht.join()
 
 
         logger.debug(f"Shutting down runtime")
         logger.debug(f"Shutting down runtime")
 
 
@@ -313,14 +299,14 @@ class Server(threading.Thread):
 
 
 
 
 @contextmanager
 @contextmanager
-def background_server(*args, shutdown_timeout=5, **kwargs) -> Tuple[Endpoint, List[Multiaddr]]:
-    """A context manager that creates server in a background process, awaits .ready on entry and shuts down on exit"""
+def background_server(*args, shutdown_timeout=5, **kwargs) -> PeerInfo:
+    """A context manager that creates server in a background , awaits .ready on entry and shuts down on exit"""
     pipe, runners_pipe = mp.Pipe(duplex=True)
     pipe, runners_pipe = mp.Pipe(duplex=True)
     runner = mp.Process(target=_server_runner, args=(runners_pipe, *args), kwargs=kwargs)
     runner = mp.Process(target=_server_runner, args=(runners_pipe, *args), kwargs=kwargs)
     try:
     try:
         runner.start()
         runner.start()
         # once the server is ready, runner will send us
         # once the server is ready, runner will send us
-        # either (False, exception) or (True, (server.listen_on, dht_maddrs))
+        # either (False, exception) or (True, PeerInfo(dht_peer_id, dht_maddrs))
         start_ok, data = pipe.recv()
         start_ok, data = pipe.recv()
         if start_ok:
         if start_ok:
             yield data
             yield data
@@ -344,8 +330,8 @@ def _server_runner(pipe, *args, **kwargs):
         return
         return
 
 
     try:
     try:
-        dht_maddrs = server.dht.get_visible_maddrs() if server.dht is not None else None
-        pipe.send((True, (server.listen_on, dht_maddrs)))
+        dht_maddrs = server.dht.get_visible_maddrs()
+        pipe.send((True, PeerInfo(server.dht.peer_id, dht_maddrs)))
         pipe.recv()  # wait for shutdown signal
         pipe.recv()  # wait for shutdown signal
 
 
     finally:
     finally:

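Example (not part of the diff): background_server now yields a single PeerInfo for the server's DHT node, whose multiaddrs can be passed as initial_peers to client-side DHT instances. A hedged sketch with made-up expert uids and sizes (keyword names follow Server.create above):

```python
import hivemind
from hivemind.moe.server.server import background_server

with background_server(
    expert_uids=["demo_expert.0", "demo_expert.1"], expert_cls="ffn", hidden_dim=64, num_handlers=2
) as server_peer_info:
    print("server peer id:", server_peer_info.peer_id)
    # connect a client-side DHT to the background server's node:
    client_dht = hivemind.DHT(initial_peers=server_peer_info.addrs, start=True)
```
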
+ 13 - 6
hivemind/p2p/p2p_daemon.py

@@ -341,6 +341,7 @@ class P2P:
         handler: Callable[[TInputStream, P2PContext], TOutputStream],
         handler: Callable[[TInputStream, P2PContext], TOutputStream],
         input_protobuf_type: Type[Message],
         input_protobuf_type: Type[Message],
         max_prefetch: int = 5,
         max_prefetch: int = 5,
+        balanced: bool = False,
     ) -> None:
     ) -> None:
         """
         """
         :param max_prefetch: Maximum number of items to prefetch from the request stream.
         :param max_prefetch: Maximum number of items to prefetch from the request stream.
@@ -405,7 +406,7 @@ class P2P:
                 finally:
                 finally:
                     processing_task.cancel()
                     processing_task.cancel()
 
 
-        await self.add_binary_stream_handler(name, _handle_stream)
+        await self.add_binary_stream_handler(name, _handle_stream, balanced=balanced)
 
 
     async def _iterate_protobuf_stream_handler(
     async def _iterate_protobuf_stream_handler(
         self, peer_id: PeerID, name: str, requests: TInputStream, output_protobuf_type: Type[Message]
         self, peer_id: PeerID, name: str, requests: TInputStream, output_protobuf_type: Type[Message]
@@ -447,16 +448,19 @@ class P2P:
         *,
         *,
         stream_input: bool = False,
         stream_input: bool = False,
         stream_output: bool = False,
         stream_output: bool = False,
+        balanced: bool = False,
     ) -> None:
     ) -> None:
         """
         """
         :param stream_input: If True, assume ``handler`` to take ``TInputStream``
         :param stream_input: If True, assume ``handler`` to take ``TInputStream``
                              (not just ``TInputProtobuf``) as input.
                              (not just ``TInputProtobuf``) as input.
         :param stream_output: If True, assume ``handler`` to return ``TOutputStream``
         :param stream_output: If True, assume ``handler`` to return ``TOutputStream``
                               (not ``Awaitable[TOutputProtobuf]``).
                               (not ``Awaitable[TOutputProtobuf]``).
+        :param balanced: If True, the p2p daemon will load-balance requests for this handler
+                         across all processes that registered it under the same name. Default: False
         """
         """
 
 
         if not stream_input and not stream_output:
         if not stream_input and not stream_output:
-            await self._add_protobuf_unary_handler(name, handler, input_protobuf_type)
+            await self._add_protobuf_unary_handler(name, handler, input_protobuf_type, balanced=balanced)
             return
             return
 
 
         async def _stream_handler(requests: P2P.TInputStream, context: P2PContext) -> P2P.TOutputStream:
         async def _stream_handler(requests: P2P.TInputStream, context: P2PContext) -> P2P.TOutputStream:
@@ -469,13 +473,14 @@ class P2P:
             else:
             else:
                 yield await output
                 yield await output
 
 
-        await self._add_protobuf_stream_handler(name, _stream_handler, input_protobuf_type)
+        await self._add_protobuf_stream_handler(name, _stream_handler, input_protobuf_type, balanced=balanced)
 
 
     async def _add_protobuf_unary_handler(
     async def _add_protobuf_unary_handler(
         self,
         self,
         handle_name: str,
         handle_name: str,
         handler: Callable[[TInputProtobuf, P2PContext], Awaitable[TOutputProtobuf]],
         handler: Callable[[TInputProtobuf, P2PContext], Awaitable[TOutputProtobuf]],
         input_protobuf_type: Type[Message],
         input_protobuf_type: Type[Message],
+        balanced: bool = False,
     ) -> None:
     ) -> None:
         """
         """
         Register a request-response (unary) handler. Unary requests and responses
         Register a request-response (unary) handler. Unary requests and responses
@@ -497,7 +502,7 @@ class P2P:
             response = await handler(input_serialized, context)
             response = await handler(input_serialized, context)
             return response.SerializeToString()
             return response.SerializeToString()
 
 
-        await self._client.add_unary_handler(handle_name, _unary_handler)
+        await self._client.add_unary_handler(handle_name, _unary_handler, balanced=balanced)
 
 
     async def call_protobuf_handler(
     async def call_protobuf_handler(
         self,
         self,
@@ -541,10 +546,12 @@ class P2P:
 
 
         self._listen_task = asyncio.create_task(listen())
         self._listen_task = asyncio.create_task(listen())
 
 
-    async def add_binary_stream_handler(self, name: str, handler: p2pclient.StreamHandler) -> None:
+    async def add_binary_stream_handler(
+        self, name: str, handler: p2pclient.StreamHandler, balanced: bool = False
+    ) -> None:
         if self._listen_task is None:
         if self._listen_task is None:
             self._start_listening()
             self._start_listening()
-        await self._client.stream_handler(name, handler)
+        await self._client.stream_handler(name, handler, balanced)
 
 
     async def call_binary_stream_handler(
     async def call_binary_stream_handler(
         self, peer_id: PeerID, handler_name: str
         self, peer_id: PeerID, handler_name: str

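Example (not part of the diff): the new `balanced` flag lets several processes register the same protocol with one daemon and have p2pd spread incoming requests across them, which is what ConnectionHandler relies on. A hedged sketch with an illustrative echo handler:

```python
import asyncio

from hivemind.p2p import P2P, P2PContext
from hivemind.proto import runtime_pb2


async def main():
    p2p = await P2P.create()

    async def rpc_echo(request: runtime_pb2.ExpertUID, context: P2PContext) -> runtime_pb2.ExpertUID:
        return runtime_pb2.ExpertUID(uid=request.uid)

    # With balanced=True, another process attached to the same daemon may register
    # a second replica of "/demo/echo" and the daemon will load-balance between them.
    await p2p.add_protobuf_handler("/demo/echo", rpc_echo, runtime_pb2.ExpertUID, balanced=True)
    await asyncio.sleep(1.0)  # keep the handler alive briefly for demonstration
    await p2p.shutdown()


asyncio.run(main())
```
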
+ 6 - 4
hivemind/p2p/p2p_daemon_bindings/control.py

@@ -246,10 +246,10 @@ class ControlClient:
         self._read_task = asyncio.create_task(self._read_from_persistent_conn(reader))
         self._read_task = asyncio.create_task(self._read_from_persistent_conn(reader))
         self._write_task = asyncio.create_task(self._write_to_persistent_conn(writer))
         self._write_task = asyncio.create_task(self._write_to_persistent_conn(writer))
 
 
-    async def add_unary_handler(self, proto: str, handler: TUnaryHandler):
+    async def add_unary_handler(self, proto: str, handler: TUnaryHandler, balanced: bool = False):
         call_id = uuid4()
         call_id = uuid4()
 
 
-        add_unary_handler_req = p2pd_pb.AddUnaryHandlerRequest(proto=proto)
+        add_unary_handler_req = p2pd_pb.AddUnaryHandlerRequest(proto=proto, balanced=balanced)
         req = p2pd_pb.PersistentConnectionRequest(callId=call_id.bytes, addUnaryHandler=add_unary_handler_req)
         req = p2pd_pb.PersistentConnectionRequest(callId=call_id.bytes, addUnaryHandler=add_unary_handler_req)
 
 
         if self.unary_handlers.get(proto):
         if self.unary_handlers.get(proto):
@@ -358,11 +358,13 @@ class ControlClient:
 
 
         return stream_info, reader, writer
         return stream_info, reader, writer
 
 
-    async def stream_handler(self, proto: str, handler_cb: StreamHandler) -> None:
+    async def stream_handler(self, proto: str, handler_cb: StreamHandler, balanced: bool = False) -> None:
         reader, writer = await self.daemon_connector.open_connection()
         reader, writer = await self.daemon_connector.open_connection()
 
 
         listen_path_maddr_bytes = self.listen_maddr.to_bytes()
         listen_path_maddr_bytes = self.listen_maddr.to_bytes()
-        stream_handler_req = p2pd_pb.StreamHandlerRequest(addr=listen_path_maddr_bytes, proto=[proto])
+        stream_handler_req = p2pd_pb.StreamHandlerRequest(
+            addr=listen_path_maddr_bytes, proto=[proto], balanced=balanced
+        )
         req = p2pd_pb.Request(type=p2pd_pb.Request.STREAM_HANDLER, streamHandler=stream_handler_req)
         req = p2pd_pb.Request(type=p2pd_pb.Request.STREAM_HANDLER, streamHandler=stream_handler_req)
         await write_pbmsg(writer, req)
         await write_pbmsg(writer, req)
 
 

+ 7 - 1
hivemind/p2p/p2p_daemon_bindings/datastructures.py

@@ -5,7 +5,7 @@ Author: Kevin Mai-Husan Chia
 """
 """
 
 
 import hashlib
 import hashlib
-from typing import Any, Sequence, Union
+from typing import Any, Sequence, Tuple, Union
 
 
 import base58
 import base58
 import multihash
 import multihash
@@ -128,6 +128,12 @@ class PeerInfo:
         addrs = [Multiaddr(addr) for addr in peer_info_pb.addrs]
         addrs = [Multiaddr(addr) for addr in peer_info_pb.addrs]
         return PeerInfo(peer_id, addrs)
         return PeerInfo(peer_id, addrs)
 
 
+    @classmethod
+    def from_tuple(cls, value: Tuple[str, Sequence[str]]) -> "PeerInfo":
+        peer_id = PeerID.from_base58(value[0])
+        addrs = [Multiaddr(addr) for addr in value[1]]
+        return PeerInfo(peer_id, addrs)
+
     def __str__(self):
     def __str__(self):
         return f"{self.peer_id.pretty()} {','.join(str(a) for a in self.addrs)}"
         return f"{self.peer_id.pretty()} {','.join(str(a) for a in self.addrs)}"
 
 

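Example (not part of the diff): the tuple format that declare_experts now stores in the DHT round-trips back into a PeerInfo; the peer id and address below are placeholders:

```python
from hivemind.p2p import PeerInfo

stored_value = ("QmcgpsyWgH8Y8ajJz1Cu72KnS5uo2Aa2LpzU7kinSupNKC", ("/ip4/127.0.0.1/tcp/31337",))
info = PeerInfo.from_tuple(stored_value)
print(info)  # "<peer id> /ip4/127.0.0.1/tcp/31337"
```
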
+ 5 - 4
hivemind/p2p/p2p_daemon_bindings/p2pclient.py

@@ -61,8 +61,8 @@ class Client:
         async with self.control.listen():
         async with self.control.listen():
             yield self
             yield self
 
 
-    async def add_unary_handler(self, proto: str, handler: TUnaryHandler):
-        await self.control.add_unary_handler(proto, handler)
+    async def add_unary_handler(self, proto: str, handler: TUnaryHandler, balanced: bool = False):
+        await self.control.add_unary_handler(proto, handler, balanced=balanced)
 
 
     async def call_unary_handler(self, peer_id: PeerID, proto: str, data: bytes) -> bytes:
     async def call_unary_handler(self, peer_id: PeerID, proto: str, data: bytes) -> bytes:
         return await self.control.call_unary_handler(peer_id, proto, data)
         return await self.control.call_unary_handler(peer_id, proto, data)
@@ -105,11 +105,12 @@ class Client:
         """
         """
         return await self.control.stream_open(peer_id=peer_id, protocols=protocols)
         return await self.control.stream_open(peer_id=peer_id, protocols=protocols)
 
 
-    async def stream_handler(self, proto: str, handler_cb: StreamHandler) -> None:
+    async def stream_handler(self, proto: str, handler_cb: StreamHandler, balanced: bool = False) -> None:
         """
         """
         Register a stream handler
         Register a stream handler
         :param proto: protocols that handler serves
         :param proto: protocols that handler serves
         :param handler_cb: handler callback
         :param handler_cb: handler callback
+        :param balanced: if True, the daemon load-balances this stream handler across all registered replicas. Default: False.
         :return:
         :return:
         """
         """
-        await self.control.stream_handler(proto=proto, handler_cb=handler_cb)
+        await self.control.stream_handler(proto=proto, handler_cb=handler_cb, balanced=balanced)

+ 4 - 2
hivemind/p2p/servicer.py

@@ -104,11 +104,12 @@ class ServicerBase:
         caller.__name__ = handler.method_name
         caller.__name__ = handler.method_name
         return caller
         return caller
 
 
-    async def add_p2p_handlers(self, p2p: P2P, wrapper: Any = None, *, namespace: Optional[str] = None) -> None:
+    async def add_p2p_handlers(
+        self, p2p: P2P, wrapper: Any = None, *, namespace: Optional[str] = None, balanced: bool = False
+    ) -> None:
         self._collect_rpc_handlers()
         self._collect_rpc_handlers()
 
 
         servicer = self if wrapper is None else wrapper
         servicer = self if wrapper is None else wrapper
-
         await asyncio.gather(
         await asyncio.gather(
             *[
             *[
                 p2p.add_protobuf_handler(
                 p2p.add_protobuf_handler(
@@ -117,6 +118,7 @@ class ServicerBase:
                     handler.request_type,
                     handler.request_type,
                     stream_input=handler.stream_input,
                     stream_input=handler.stream_input,
                     stream_output=handler.stream_output,
                     stream_output=handler.stream_output,
+                    balanced=balanced,
                 )
                 )
                 for handler in self._rpc_handlers
                 for handler in self._rpc_handlers
             ]
             ]

+ 2 - 0
hivemind/proto/p2pd.proto

@@ -90,6 +90,7 @@ message StreamOpenRequest {
 message StreamHandlerRequest {
 message StreamHandlerRequest {
   required bytes addr = 1;
   required bytes addr = 1;
   repeated string proto = 2;
   repeated string proto = 2;
+  required bool balanced = 3;
 }
 }
 
 
 message ErrorResponse {
 message ErrorResponse {
@@ -201,6 +202,7 @@ message CallUnaryResponse {
 
 
 message AddUnaryHandlerRequest {
 message AddUnaryHandlerRequest {
   required string proto = 1;
   required string proto = 1;
+  required bool balanced = 2;
 }
 }
 
 
 message DaemonError {
 message DaemonError {

+ 1 - 1
hivemind/utils/__init__.py

@@ -1,5 +1,4 @@
 from hivemind.utils.asyncio import *
 from hivemind.utils.asyncio import *
-from hivemind.utils.grpc import *
 from hivemind.utils.limits import increase_file_limit
 from hivemind.utils.limits import increase_file_limit
 from hivemind.utils.logging import get_logger, use_hivemind_log_handler
 from hivemind.utils.logging import get_logger, use_hivemind_log_handler
 from hivemind.utils.mpfuture import *
 from hivemind.utils.mpfuture import *
@@ -7,5 +6,6 @@ from hivemind.utils.nested import *
 from hivemind.utils.networking import *
 from hivemind.utils.networking import *
 from hivemind.utils.performance_ema import PerformanceEMA
 from hivemind.utils.performance_ema import PerformanceEMA
 from hivemind.utils.serializer import MSGPackSerializer, SerializerBase
 from hivemind.utils.serializer import MSGPackSerializer, SerializerBase
+from hivemind.utils.streaming import combine_from_streaming, split_for_streaming
 from hivemind.utils.tensor_descr import BatchTensorDescriptor, TensorDescriptor
 from hivemind.utils.tensor_descr import BatchTensorDescriptor, TensorDescriptor
 from hivemind.utils.timed_storage import *
 from hivemind.utils.timed_storage import *

+ 7 - 1
hivemind/utils/asyncio.py

@@ -2,7 +2,7 @@ import asyncio
 import concurrent.futures
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import AbstractAsyncContextManager, AbstractContextManager, asynccontextmanager
 from contextlib import AbstractAsyncContextManager, AbstractContextManager, asynccontextmanager
-from typing import AsyncIterable, AsyncIterator, Awaitable, Callable, ContextManager, Optional, Tuple, TypeVar, Union
+from typing import AsyncIterable, AsyncIterator, Awaitable, Callable, Iterable, Optional, Tuple, TypeVar, Union
 
 
 import uvloop
 import uvloop
 
 
@@ -29,6 +29,12 @@ async def anext(aiter: AsyncIterator[T]) -> Union[T, StopAsyncIteration]:
     return await aiter.__anext__()
     return await aiter.__anext__()
 
 
 
 
+async def iter_as_aiter(iterable: Iterable[T]) -> AsyncIterator[T]:
+    """create an asynchronous iterator from single iterable"""
+    for elem in iterable:
+        yield elem
+
+
 async def as_aiter(*args: T) -> AsyncIterator[T]:
 async def as_aiter(*args: T) -> AsyncIterator[T]:
     """create an asynchronous iterator from a sequence of values"""
     """create an asynchronous iterator from a sequence of values"""
     for arg in args:
     for arg in args:
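
Editor's note: a minimal usage sketch of the new `iter_as_aiter` helper. It wraps an ordinary iterable so it can be consumed with `async for`, e.g. when turning a generator of request chunks into a stream for the libp2p handlers.

import asyncio

from hivemind.utils.asyncio import iter_as_aiter


async def main():
    # wrap a plain iterable so it can be consumed asynchronously
    async for item in iter_as_aiter(range(3)):
        print(item)  # prints 0, 1, 2


asyncio.run(main())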

+ 0 - 210
hivemind/utils/grpc.py

@@ -1,210 +0,0 @@
-"""
-Utilities for running GRPC services: compile protobuf, patch legacy versions, etc
-"""
-
-from __future__ import annotations
-
-import os
-import threading
-from typing import Any, Dict, Iterable, Iterator, NamedTuple, Optional, Tuple, Type, TypeVar, Union
-
-import grpc
-
-from hivemind.proto import runtime_pb2
-from hivemind.utils.logging import get_logger
-from hivemind.utils.networking import Endpoint
-from hivemind.utils.timed_storage import TimedStorage, ValueWithExpiration, get_dht_time
-
-logger = get_logger(__name__)
-
-Stub = TypeVar("Stub")
-
-GRPC_KEEPALIVE_OPTIONS = (
-    ("grpc.keepalive_time_ms", 60 * 1000),
-    ("grpc.keepalive_timeout_ms", 60 * 1000),
-    ("grpc.keepalive_permit_without_calls", True),
-    ("grpc.http2.max_pings_without_data", 0),
-    ("grpc.http2.min_time_between_pings_ms", 30 * 1000),
-    ("grpc.http2.min_ping_interval_without_data_ms", 10 * 1000),
-)
-
-
-class ChannelInfo(NamedTuple):
-    target: Endpoint
-    aio: bool
-    options: Tuple[Tuple[str, str], ...]
-    credentials: Optional[grpc.ChannelCredentials]
-    compression: Optional[grpc.Compression]
-
-
-class ChannelCache(TimedStorage[ChannelInfo, Tuple[Union[grpc.Channel, grpc.aio.Channel], Dict]]):
-    """
-    A process-wide cache of gRPC channels, supports both normal and aio channels, secure/insecure channels, etc
-    Based on grpcio internal channel cache by Richard Belleville and Lidi Zheng (thanks!)
-    Unlike TimedStorage, ChannelCache actively evicts stale channels even if the cache is not accessed
-    Unlike grpc._simple_stubs.ChannelCache, this implementation supports aio and does not forcibly close active channels
-    """
-
-    MAXIMUM_CHANNELS = int(os.environ.get("GRPC_PYTHON_MANAGED_CHANNEL_MAXIMUM", 4096))
-    EVICTION_PERIOD_SECONDS = float(os.environ.get("GRPC_PYTHON_MANAGED_CHANNEL_EVICTION_SECONDS", 10 * 60))
-    logger.debug(f"Eviction period = {EVICTION_PERIOD_SECONDS}s, max channels = {MAXIMUM_CHANNELS}")
-
-    _singleton: Optional[ChannelCache] = None
-    _singleton_pid: int = os.getpid()
-    _lock: threading.RLock = threading.RLock()
-    _update_eviction_evt: threading.Event = threading.Event()
-
-    def __init__(self, _created_as_singleton=False):
-        assert _created_as_singleton, f"Please use {self.__class__.__name__}.get_singleton()"
-        super().__init__(maxsize=self.MAXIMUM_CHANNELS)
-        self._is_active = True
-        self._nearest_expiration_time = float("inf")
-        self._eviction_thread = threading.Thread(target=self._evict_stale_channels_in_background, daemon=True)
-        self._eviction_thread.start()
-
-    @classmethod
-    def get_singleton(cls):
-        """Get or create the channel cache for the current process"""
-        with cls._lock:
-            if cls._singleton is None or cls._singleton_pid != os.getpid():
-                if cls._singleton is not None:
-                    cls._singleton._stop_background_thread()
-                cls._singleton, cls._singleton_pid = cls(_created_as_singleton=True), os.getpid()
-            return cls._singleton
-
-    @classmethod
-    def get_stub(
-        cls,
-        target: Endpoint,
-        stub_type: Type[Stub],
-        *,
-        aio: bool,
-        options: Tuple[Tuple[str, Any]] = (),
-        channel_credentials: Optional[grpc.ChannelCredentials] = None,
-        compression: Optional[grpc.Compression] = None,
-    ) -> Stub:
-        """
-        Create a grpc channel with given options or reuse pre-existing one
-
-        :param target: the recipient's address and port
-        :param stub_type: a gRPC stub (client) to be instantiated
-        :param aio: if True, returns grpc.Channel, otherwise returns grpc.aio.Channel
-        :param options: see https://grpc.github.io/grpc/core/group__grpc__arg__keys.html
-        :param channel_credentials: if specified, create a secure channel usin these credentials (default = insecure)
-        :param compression: see https://github.com/grpc/grpc/tree/master/examples/python/compression
-        """
-        cache = cls.get_singleton()
-        with cls._lock:
-            key = ChannelInfo(target, aio, tuple(options), channel_credentials, compression)
-            entry: ValueWithExpiration = super(cls, cache).get(key)
-
-            if entry is not None:
-                channel, stubs = entry.value
-            else:
-                channel = cls._create_channel(*key)
-                stubs = {}
-
-            channel._channel.check_connectivity_state(True)
-
-            if stub_type not in stubs:
-                stubs[stub_type] = stub_type(channel)
-
-            # either cache channel or update expiration of an existing channel
-            expiration_time = get_dht_time() + cls.EVICTION_PERIOD_SECONDS
-            super(cls, cache).store(key, (channel, stubs), expiration_time)
-
-            if expiration_time < cache._nearest_expiration_time:
-                cache._nearest_expiration_time = expiration_time
-                cls._update_eviction_evt.set()
-
-            return stubs[stub_type]
-
-    @classmethod
-    def _create_channel(
-        cls,
-        target: Endpoint,
-        aio: bool,
-        extra_options: Tuple[Tuple[str, Any], ...],
-        channel_credentials: Optional[grpc.ChannelCredentials],
-        compression: Optional[grpc.Compression],
-    ) -> Union[grpc.Channel, grpc.aio.Channel]:
-        namespace = grpc.aio if aio else grpc
-
-        options = extra_options + GRPC_KEEPALIVE_OPTIONS
-
-        if channel_credentials is None:
-            logger.debug(
-                f"Creating insecure {namespace} channel with options '{options}' " f"and compression '{compression}'"
-            )
-            return namespace.insecure_channel(target, options=options, compression=compression)
-        else:
-            logger.debug(
-                f"Creating secure {namespace} channel with credentials '{channel_credentials}', "
-                f"options '{options}' and compression '{compression}'"
-            )
-            return namespace.secure_channel(
-                target, credentials=channel_credentials, options=options, compression=compression
-            )
-
-    def _evict_stale_channels_in_background(self):
-        while self._is_active:
-            now = get_dht_time()
-            time_to_wait = max(0.0, self._nearest_expiration_time - now)
-            interrupted_early = self._update_eviction_evt.wait(time_to_wait if time_to_wait != float("inf") else None)
-            if interrupted_early:
-                self._update_eviction_evt.clear()
-                continue
-
-            with self._lock:
-                self._remove_outdated()
-                _, entry = super().top()
-                self._nearest_expiration_time = entry.expiration_time if entry is not None else float("inf")
-
-    def _stop_background_thread(self):
-        with self._lock:
-            self._is_active = False
-            self._update_eviction_evt.set()
-
-    def store(self, *args, **kwargs) -> ValueError:
-        raise ValueError(f"Please use {self.__class__.__name__}.get_stub to get or create stubs")
-
-    def get(self, *args, **kwargs) -> ValueError:
-        raise ValueError(f"Please use {self.__class__.__name__}.get_stub to get or create stubs")
-
-    def top(self) -> ValueError:
-        raise ValueError(f"Please use {self.__class__.__name__}.get_stub to get or create stubs")
-
-
-STREAMING_CHUNK_SIZE_BYTES = 2**16
-
-
-def split_for_streaming(
-    serialized_tensor: runtime_pb2.Tensor,
-    chunk_size_bytes: int = STREAMING_CHUNK_SIZE_BYTES,
-) -> Iterator[runtime_pb2.Tensor]:
-    """Split serialized_tensor into multiple chunks for gRPC streaming"""
-    buffer = memoryview(serialized_tensor.buffer)
-    num_chunks = len(range(0, len(buffer), chunk_size_bytes))
-    yield runtime_pb2.Tensor(
-        compression=serialized_tensor.compression,
-        buffer=buffer[:chunk_size_bytes].tobytes(),
-        chunks=num_chunks,
-        size=serialized_tensor.size,
-        dtype=serialized_tensor.dtype,
-        requires_grad=serialized_tensor.requires_grad,
-    )
-    for chunk_start in range(chunk_size_bytes, len(buffer), chunk_size_bytes):
-        yield runtime_pb2.Tensor(buffer=buffer[chunk_start : chunk_start + chunk_size_bytes].tobytes())
-
-
-def combine_from_streaming(stream: Iterable[runtime_pb2.Tensor]) -> runtime_pb2.Tensor:
-    """Restore a result of split_into_chunks into a single serialized tensor"""
-    stream = iter(stream)
-    first_chunk = next(stream)
-    serialized_tensor = runtime_pb2.Tensor()
-    serialized_tensor.CopyFrom(first_chunk)
-    buffer_chunks = [first_chunk.buffer]
-    for tensor_part in stream:
-        buffer_chunks.append(tensor_part.buffer)
-    serialized_tensor.buffer = b"".join(buffer_chunks)
-    return serialized_tensor

+ 2 - 24
hivemind/utils/networking.py

@@ -1,35 +1,13 @@
 import socket
 import socket
 from contextlib import closing
 from contextlib import closing
 from ipaddress import ip_address
 from ipaddress import ip_address
-from typing import Optional, Sequence
+from typing import Sequence
 
 
 from multiaddr import Multiaddr
 from multiaddr import Multiaddr
 
 
-Hostname, Port = str, int  # flavour types
-Endpoint = str  # e.g. 1.2.3.4:1337 or [2a21:6с8:b192:2105]:8888, https://networkengineering.stackexchange.com/a/9435
 LOCALHOST = "127.0.0.1"
 LOCALHOST = "127.0.0.1"
 
 
 
 
-def get_port(endpoint: Endpoint) -> Optional[Port]:
-    """get port or None if port is undefined"""
-    # TODO: find a standard way to get port, make sure it works in malformed ports
-    try:
-        return int(endpoint[endpoint.rindex(":") + 1 :], base=10)
-    except ValueError:  # :* or not specified
-        return None
-
-
-def replace_port(endpoint: Endpoint, new_port: Port) -> Endpoint:
-    assert endpoint.endswith(":*") or get_port(endpoint) is not None, endpoint
-    return f"{endpoint[:endpoint.rindex(':')]}:{new_port}"
-
-
-def strip_port(endpoint: Endpoint) -> Hostname:
-    """Removes port from the end of endpoint. If port is not specified, does nothing"""
-    maybe_port = endpoint[endpoint.rindex(":") + 1 :]
-    return endpoint[: endpoint.rindex(":")] if maybe_port.isdigit() or maybe_port == "*" else endpoint
-
-
 def get_free_port(params=(socket.AF_INET, socket.SOCK_STREAM), opt=(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)):
 def get_free_port(params=(socket.AF_INET, socket.SOCK_STREAM), opt=(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)):
     """
     """
     Finds a tcp port that can be occupied with a socket with *params and use *opt options.
     Finds a tcp port that can be occupied with a socket with *params and use *opt options.
@@ -48,7 +26,7 @@ def get_free_port(params=(socket.AF_INET, socket.SOCK_STREAM), opt=(socket.SOL_S
 
 
 def choose_ip_address(
 def choose_ip_address(
     maddrs: Sequence[Multiaddr], prefer_global: bool = True, protocol_priority: Sequence[str] = ("ip4", "ip6")
     maddrs: Sequence[Multiaddr], prefer_global: bool = True, protocol_priority: Sequence[str] = ("ip4", "ip6")
-) -> Hostname:
+) -> str:
     """
     """
     Currently, some components of hivemind are not converted to work over libp2p and use classical networking.
     Currently, some components of hivemind are not converted to work over libp2p and use classical networking.
     To allow other peers reach a server when needed, these components announce a machine's IP address.
     To allow other peers reach a server when needed, these components announce a machine's IP address.
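
Editor's note: a small usage sketch of `choose_ip_address` (the multiaddrs below are illustrative; with the default `prefer_global=True`, the helper is expected to pick the globally routable address over the loopback one).

from multiaddr import Multiaddr

from hivemind.utils.networking import choose_ip_address

maddrs = [
    Multiaddr("/ip4/127.0.0.1/tcp/31337"),
    Multiaddr("/ip4/8.8.8.8/tcp/31337"),
]
print(choose_ip_address(maddrs))  # expected to print "8.8.8.8"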

+ 49 - 0
hivemind/utils/streaming.py

@@ -0,0 +1,49 @@
+"""
+Utilities for streaming tensors
+"""
+
+from __future__ import annotations
+
+from typing import Iterable, Iterator, TypeVar
+
+from hivemind.proto import runtime_pb2
+from hivemind.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+STREAMING_CHUNK_SIZE_BYTES = 2**16
+
+
+def split_for_streaming(
+    serialized_tensor: runtime_pb2.Tensor,
+    chunk_size_bytes: int = STREAMING_CHUNK_SIZE_BYTES,
+) -> Iterator[runtime_pb2.Tensor]:
+    """Split serialized_tensor into multiple chunks for streaming"""
+    buffer = memoryview(serialized_tensor.buffer)
+    num_chunks = len(range(0, len(buffer), chunk_size_bytes))
+    yield runtime_pb2.Tensor(
+        compression=serialized_tensor.compression,
+        buffer=buffer[:chunk_size_bytes].tobytes(),
+        chunks=num_chunks,
+        size=serialized_tensor.size,
+        dtype=serialized_tensor.dtype,
+        requires_grad=serialized_tensor.requires_grad,
+    )
+    for chunk_start in range(chunk_size_bytes, len(buffer), chunk_size_bytes):
+        yield runtime_pb2.Tensor(buffer=buffer[chunk_start : chunk_start + chunk_size_bytes].tobytes())
+
+
+def combine_from_streaming(stream: Iterable[runtime_pb2.Tensor]) -> runtime_pb2.Tensor:
+    """Restore a result of split_into_chunks into a single serialized tensor"""
+    stream = iter(stream)
+    first_chunk = next(stream)
+    serialized_tensor = runtime_pb2.Tensor()
+    serialized_tensor.CopyFrom(first_chunk)
+    buffer_chunks = [first_chunk.buffer]
+    for tensor_part in stream:
+        buffer_chunks.append(tensor_part.buffer)
+    serialized_tensor.buffer = b"".join(buffer_chunks)
+    return serialized_tensor
+
+
+StreamMessage = TypeVar("StreamMessage")
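
Editor's note: the relocated helpers round-trip cleanly with the compression utilities, as exercised in tests/test_compression.py further down; a minimal sketch:

import torch

from hivemind.compression import deserialize_torch_tensor, serialize_torch_tensor
from hivemind.utils.streaming import combine_from_streaming, split_for_streaming

tensor = torch.randn(512, 128)
serialized = serialize_torch_tensor(tensor)  # uncompressed by default

# split into <= 2**16-byte chunks, e.g. to stay under the daemon's message size limit
chunks = list(split_for_streaming(serialized, chunk_size_bytes=2**16))

restored = deserialize_torch_tensor(combine_from_streaming(chunks))
assert torch.allclose(restored, tensor)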

+ 2 - 2
setup.py

@@ -13,14 +13,14 @@ from setuptools import find_packages, setup
 from setuptools.command.build_py import build_py
 from setuptools.command.build_py import build_py
 from setuptools.command.develop import develop
 from setuptools.command.develop import develop
 
 
-P2PD_VERSION = "v0.3.8"
+P2PD_VERSION = "v0.3.9"
 
 
 P2PD_SOURCE_URL = f"https://github.com/learning-at-home/go-libp2p-daemon/archive/refs/tags/{P2PD_VERSION}.tar.gz"
 P2PD_SOURCE_URL = f"https://github.com/learning-at-home/go-libp2p-daemon/archive/refs/tags/{P2PD_VERSION}.tar.gz"
 P2PD_BINARY_URL = f"https://github.com/learning-at-home/go-libp2p-daemon/releases/download/{P2PD_VERSION}/"
 P2PD_BINARY_URL = f"https://github.com/learning-at-home/go-libp2p-daemon/releases/download/{P2PD_VERSION}/"
 
 
 # The value is sha256 of the binary from the release page
 # The value is sha256 of the binary from the release page
 EXECUTABLES = {
 EXECUTABLES = {
-    "p2pd": "785058526d993f699c674dc2f9b66d565a52315a18b79b629998fab3ebd8e20f",
+    "p2pd": "8f9434f4717f6e851430f75f07e283d5ddeb2c7cde1b3648e677d813703f4e40",
 }
 }
 
 
 
 

+ 3 - 2
tests/test_compression.py

@@ -20,6 +20,7 @@ from hivemind.compression import (
 )
 )
 from hivemind.compression.adaptive import AdaptiveCompressionBase
 from hivemind.compression.adaptive import AdaptiveCompressionBase
 from hivemind.proto.runtime_pb2 import CompressionType
 from hivemind.proto.runtime_pb2 import CompressionType
+from hivemind.utils.streaming import combine_from_streaming, split_for_streaming
 
 
 from test_utils.dht_swarms import launch_dht_instances
 from test_utils.dht_swarms import launch_dht_instances
 
 
@@ -47,9 +48,9 @@ def test_tensor_compression(size=(128, 128, 64), alpha=5e-08, beta=0.0008):
 def test_serialize_tensor():
 def test_serialize_tensor():
     def _check(tensor, compression, rtol=1e-5, atol=1e-8, chunk_size=30 * 1024):
     def _check(tensor, compression, rtol=1e-5, atol=1e-8, chunk_size=30 * 1024):
         serialized_tensor = serialize_torch_tensor(tensor, compression)
         serialized_tensor = serialize_torch_tensor(tensor, compression)
-        chunks = list(hivemind.split_for_streaming(serialized_tensor, chunk_size))
+        chunks = list(split_for_streaming(serialized_tensor, chunk_size))
         assert len(chunks) == (len(serialized_tensor.buffer) - 1) // chunk_size + 1
         assert len(chunks) == (len(serialized_tensor.buffer) - 1) // chunk_size + 1
-        restored = hivemind.combine_from_streaming(chunks)
+        restored = combine_from_streaming(chunks)
         assert torch.allclose(deserialize_torch_tensor(restored), tensor, rtol=rtol, atol=atol)
         assert torch.allclose(deserialize_torch_tensor(restored), tensor, rtol=rtol, atol=atol)
 
 
     tensor = torch.randn(512, 12288)
     tensor = torch.randn(512, 12288)

+ 192 - 0
tests/test_connection_handler.py

@@ -0,0 +1,192 @@
+from __future__ import annotations
+
+import asyncio
+import math
+from typing import Any, Dict
+
+import pytest
+import torch
+
+from hivemind.compression import deserialize_tensor_stream, deserialize_torch_tensor, serialize_torch_tensor
+from hivemind.dht import DHT
+from hivemind.moe.server.connection_handler import ConnectionHandler
+from hivemind.moe.server.expert_backend import ExpertBackend
+from hivemind.moe.server.task_pool import TaskPool
+from hivemind.p2p.p2p_daemon_bindings.control import DEFAULT_MAX_MSG_SIZE, P2PHandlerError
+from hivemind.proto import runtime_pb2
+from hivemind.utils.asyncio import amap_in_executor, iter_as_aiter
+from hivemind.utils.serializer import MSGPackSerializer
+from hivemind.utils.streaming import split_for_streaming
+from hivemind.utils.tensor_descr import BatchTensorDescriptor
+
+
+@pytest.mark.forked
+@pytest.mark.asyncio
+async def test_connection_handler_info():
+    handler = ConnectionHandler(
+        DHT(start=True),
+        dict(expert1=DummyExpertBackend("expert1", k=1), expert2=DummyExpertBackend("expert2", k=2)),
+    )
+    handler.start()
+
+    client_dht = DHT(start=True, client_mode=True, initial_peers=handler.dht.get_visible_maddrs())
+    client_stub = ConnectionHandler.get_stub(await client_dht.replicate_p2p(), handler.dht.peer_id)
+
+    # info
+    response = await client_stub.rpc_info(runtime_pb2.ExpertUID(uid="expert1"))
+    assert MSGPackSerializer.loads(response.serialized_info) == dict(name="expert1")
+
+    response = await client_stub.rpc_info(runtime_pb2.ExpertUID(uid="expert2"))
+    assert MSGPackSerializer.loads(response.serialized_info) == dict(name="expert2")
+
+    with pytest.raises(P2PHandlerError):
+        await client_stub.rpc_info(runtime_pb2.ExpertUID(uid="expert999"))
+
+
+@pytest.mark.forked
+@pytest.mark.asyncio
+async def test_connection_handler_forward():
+    handler = ConnectionHandler(
+        DHT(start=True),
+        dict(expert1=DummyExpertBackend("expert1", k=1), expert2=DummyExpertBackend("expert2", k=2)),
+    )
+    handler.start()
+
+    client_dht = DHT(start=True, client_mode=True, initial_peers=handler.dht.get_visible_maddrs())
+    client_stub = ConnectionHandler.get_stub(await client_dht.replicate_p2p(), handler.dht.peer_id)
+
+    inputs = torch.randn(1, 2)
+    inputs_long = torch.randn(2**21, 2)
+
+    # forward unary
+    response = await client_stub.rpc_forward(
+        runtime_pb2.ExpertRequest(uid="expert1", tensors=[serialize_torch_tensor(inputs)])
+    )
+    outputs = deserialize_torch_tensor(response.tensors[0])
+    assert len(response.tensors) == 1
+    assert torch.allclose(outputs, inputs * 1)
+
+    response = await client_stub.rpc_forward(
+        runtime_pb2.ExpertRequest(uid="expert2", tensors=[serialize_torch_tensor(inputs)])
+    )
+    outputs = deserialize_torch_tensor(response.tensors[0])
+    assert len(response.tensors) == 1
+    assert torch.allclose(outputs, inputs * 2)
+
+    # forward streaming
+    split = (
+        p for t in [serialize_torch_tensor(inputs_long)] for p in split_for_streaming(t, chunk_size_bytes=2**16)
+    )
+    output_generator = await client_stub.rpc_forward_stream(
+        amap_in_executor(
+            lambda tensor_part: runtime_pb2.ExpertRequest(uid="expert2", tensors=[tensor_part]),
+            iter_as_aiter(split),
+        ),
+    )
+    outputs_list = [part async for part in output_generator]
+    assert len(outputs_list) == math.ceil(inputs_long.numel() * 4 / DEFAULT_MAX_MSG_SIZE)
+
+    results = await deserialize_tensor_stream(amap_in_executor(lambda r: r.tensors, iter_as_aiter(outputs_list)))
+    assert len(results) == 1
+    assert torch.allclose(results[0], inputs_long * 2)
+
+    # forward errors
+    with pytest.raises(P2PHandlerError):
+        # no such expert: fails with P2PHandlerError KeyError('expert3')
+        await client_stub.rpc_forward(
+            runtime_pb2.ExpertRequest(uid="expert3", tensors=[serialize_torch_tensor(inputs)])
+        )
+
+    with pytest.raises(P2PHandlerError):
+        # bad input shape: P2PHandlerError("AssertionError") raised by DummyPool.submit_task
+        await client_stub.rpc_forward(
+            runtime_pb2.ExpertRequest(uid="expert1", tensors=[serialize_torch_tensor(torch.arange(5))])
+        )
+
+
+@pytest.mark.forked
+@pytest.mark.asyncio
+async def test_connection_handler_backward():
+    handler = ConnectionHandler(
+        DHT(start=True),
+        dict(expert1=DummyExpertBackend("expert1", k=1), expert2=DummyExpertBackend("expert2", k=2)),
+    )
+    handler.start()
+
+    client_dht = DHT(start=True, client_mode=True, initial_peers=handler.dht.get_visible_maddrs())
+    client_stub = ConnectionHandler.get_stub(await client_dht.replicate_p2p(), handler.dht.peer_id)
+
+    inputs = torch.randn(1, 2)
+    inputs_long = torch.randn(2**21, 2)
+
+    # backward unary
+    response = await client_stub.rpc_backward(
+        runtime_pb2.ExpertRequest(
+            uid="expert2", tensors=[serialize_torch_tensor(inputs * -1), serialize_torch_tensor(inputs)]
+        )
+    )
+    outputs = deserialize_torch_tensor(response.tensors[0])
+    assert len(response.tensors) == 1
+    assert torch.allclose(outputs, inputs * -2)
+
+    # backward streaming
+    split = (
+        p
+        for t in [serialize_torch_tensor(inputs_long * 3), serialize_torch_tensor(inputs_long * 0)]
+        for p in split_for_streaming(t, chunk_size_bytes=2**16)
+    )
+    output_generator = await client_stub.rpc_backward_stream(
+        amap_in_executor(
+            lambda tensor_part: runtime_pb2.ExpertRequest(uid="expert1", tensors=[tensor_part]),
+            iter_as_aiter(split),
+        ),
+    )
+    results = await deserialize_tensor_stream(amap_in_executor(lambda r: r.tensors, output_generator))
+    assert len(results) == 1
+    assert torch.allclose(results[0], inputs_long * 3)
+
+    # backward errors
+    with pytest.raises(P2PHandlerError):
+        # bad input schema: fails with P2PHandlerError IndexError('tuple index out of range')
+        await client_stub.rpc_backward(runtime_pb2.ExpertRequest(uid="expert2", tensors=[]))
+
+    with pytest.raises(P2PHandlerError):
+        # backward fails: empty stream
+        output_generator = await client_stub.rpc_backward_stream(
+            amap_in_executor(
+                lambda tensor_part: runtime_pb2.ExpertRequest(uid="expert2", tensors=[tensor_part]),
+                iter_as_aiter([]),
+            ),
+        )
+        results = await deserialize_tensor_stream(amap_in_executor(lambda r: r.tensors, output_generator))
+        assert len(results) == 1
+        assert torch.allclose(results[0], inputs_long * 3)
+
+    # check that handler did not crash after failed request
+    await client_stub.rpc_forward(runtime_pb2.ExpertRequest(uid="expert1", tensors=[serialize_torch_tensor(inputs)]))
+
+    handler.terminate()
+    handler.join()
+
+
+class DummyPool(TaskPool):
+    def __init__(self, k: float):
+        self.k = k
+
+    async def submit_task(self, *inputs: torch.Tensor):
+        await asyncio.sleep(0.01)
+        assert inputs[0].shape[-1] == 2
+        return [inputs[0] * self.k]
+
+
+class DummyExpertBackend(ExpertBackend):
+    def __init__(self, name: str, k: float):
+        self.name = name
+        self.outputs_schema = [BatchTensorDescriptor.from_tensor(torch.randn(1, 2))]
+        self.grad_inputs_schema = [BatchTensorDescriptor.from_tensor(torch.randn(1, 2))]
+        self.forward_pool = DummyPool(k)
+        self.backward_pool = DummyPool(k)
+
+    def get_info(self) -> Dict[str, Any]:
+        """Get expert parameters and stats. Used by RemoteExpert to check shapes and for DMoE orchestration."""
+        return dict(name=self.name)

+ 20 - 9
tests/test_custom_experts.py

@@ -3,7 +3,8 @@ import os
 import pytest
 import pytest
 import torch
 import torch
 
 
-from hivemind import RemoteExpert
+from hivemind.dht import DHT
+from hivemind.moe.client.expert import RemoteExpertInfo, create_remote_experts
 from hivemind.moe.server import background_server
 from hivemind.moe.server import background_server
 
 
 CUSTOM_EXPERTS_PATH = os.path.join(os.path.dirname(__file__), "test_utils", "custom_networks.py")
 CUSTOM_EXPERTS_PATH = os.path.join(os.path.dirname(__file__), "test_utils", "custom_networks.py")
@@ -17,11 +18,16 @@ def test_custom_expert(hid_dim=16):
         device="cpu",
         device="cpu",
         hidden_dim=hid_dim,
         hidden_dim=hid_dim,
         num_handlers=2,
         num_handlers=2,
-        no_dht=True,
         custom_module_path=CUSTOM_EXPERTS_PATH,
         custom_module_path=CUSTOM_EXPERTS_PATH,
-    ) as (server_endpoint, _):
-        expert0 = RemoteExpert("expert.0", server_endpoint)
-        expert1 = RemoteExpert("expert.1", server_endpoint)
+    ) as server_peer_info:
+        dht = DHT(initial_peers=server_peer_info.addrs, start=True)
+        expert0, expert1 = create_remote_experts(
+            [
+                RemoteExpertInfo(uid="expert.0", peer_info=server_peer_info),
+                RemoteExpertInfo(uid="expert.1", peer_info=server_peer_info),
+            ],
+            dht=dht,
+        )
 
 
         for batch_size in (1, 4):
         for batch_size in (1, 4):
             batch = torch.randn(batch_size, hid_dim)
             batch = torch.randn(batch_size, hid_dim)
@@ -43,11 +49,16 @@ def test_multihead_expert(hid_dim=16):
         device="cpu",
         device="cpu",
         hidden_dim=hid_dim,
         hidden_dim=hid_dim,
         num_handlers=2,
         num_handlers=2,
-        no_dht=True,
         custom_module_path=CUSTOM_EXPERTS_PATH,
         custom_module_path=CUSTOM_EXPERTS_PATH,
-    ) as (server_endpoint, _):
-        expert0 = RemoteExpert("expert.0", server_endpoint)
-        expert1 = RemoteExpert("expert.1", server_endpoint)
+    ) as server_peer_info:
+        dht = DHT(initial_peers=server_peer_info.addrs, start=True)
+        expert0, expert1 = create_remote_experts(
+            [
+                RemoteExpertInfo(uid="expert.0", peer_info=server_peer_info),
+                RemoteExpertInfo(uid="expert.1", peer_info=server_peer_info),
+            ],
+            dht=dht,
+        )
 
 
         for batch_size in (1, 4):
         for batch_size in (1, 4):
             batch = (
             batch = (

+ 28 - 23
tests/test_dht_experts.py

@@ -6,11 +6,11 @@ import numpy as np
 import pytest
 import pytest
 
 
 import hivemind
 import hivemind
-from hivemind import LOCALHOST
 from hivemind.dht import DHTNode
 from hivemind.dht import DHTNode
 from hivemind.moe.client.beam_search import MoEBeamSearcher
 from hivemind.moe.client.beam_search import MoEBeamSearcher
 from hivemind.moe.server import declare_experts, get_experts
 from hivemind.moe.server import declare_experts, get_experts
 from hivemind.moe.server.expert_uid import UidEndpoint, is_valid_prefix, is_valid_uid, split_uid
 from hivemind.moe.server.expert_uid import UidEndpoint, is_valid_prefix, is_valid_uid, split_uid
+from hivemind.p2p import PeerInfo
 
 
 
 
 @pytest.mark.forked
 @pytest.mark.forked
@@ -25,17 +25,18 @@ def test_store_get_experts(n_peers=10):
     expert_uids = [f"my_expert.{i}" for i in range(50)]
     expert_uids = [f"my_expert.{i}" for i in range(50)]
     batch_size = 10
     batch_size = 10
     for batch_start in range(0, len(expert_uids), batch_size):
     for batch_start in range(0, len(expert_uids), batch_size):
-        declare_experts(first_peer, expert_uids[batch_start : batch_start + batch_size], "localhost:1234")
+        declare_experts(first_peer, expert_uids[batch_start : batch_start + batch_size])
 
 
     found = get_experts(other_peer, random.sample(expert_uids, 5) + ["foo", "bar"])
     found = get_experts(other_peer, random.sample(expert_uids, 5) + ["foo", "bar"])
     assert all(res is not None for res in found[:-2]), "Could not find some existing experts"
     assert all(res is not None for res in found[:-2]), "Could not find some existing experts"
     assert all(res is None for res in found[-2:]), "Found non-existing experts"
     assert all(res is None for res in found[-2:]), "Found non-existing experts"
 
 
-    other_expert, other_port = "my_other_expert.1337", random.randint(1000, 9999)
-    declare_experts(other_peer, [other_expert], f"that_host:{other_port}")
+    other_expert = "my_other_expert.1337"
+    declare_experts(other_peer, [other_expert])
     first_notfound, first_found = get_experts(first_peer, ["foobar", other_expert])
     first_notfound, first_found = get_experts(first_peer, ["foobar", other_expert])
     assert isinstance(first_found, hivemind.RemoteExpert)
     assert isinstance(first_found, hivemind.RemoteExpert)
-    assert first_found.endpoint == f"that_host:{other_port}"
+    assert first_found.server_peer_info.peer_id == other_peer.peer_id
+    assert first_notfound is None
 
 
     # test graceful shutdown
     # test graceful shutdown
     first_peer.shutdown()
     first_peer.shutdown()
@@ -43,30 +44,31 @@ def test_store_get_experts(n_peers=10):
     time.sleep(1.0)
     time.sleep(1.0)
     remaining_peer1 = random.choice([peer for peer in peers if peer.is_alive()])
     remaining_peer1 = random.choice([peer for peer in peers if peer.is_alive()])
     remaining_peer2 = random.choice([peer for peer in peers if peer.is_alive()])
     remaining_peer2 = random.choice([peer for peer in peers if peer.is_alive()])
-    assert all(declare_experts(remaining_peer1, ["new_expert.1"], "dummy"))
-    assert get_experts(remaining_peer2, ["new_expert.1"])[0].endpoint == "dummy"
+    assert all(declare_experts(remaining_peer1, ["new_expert.1"]))
+    assert get_experts(remaining_peer2, ["new_expert.1"])[0].server_peer_info.peer_id == remaining_peer1.peer_id
 
 
 
 
 @pytest.mark.forked
 @pytest.mark.forked
 def test_beam_search(
 def test_beam_search(
     n_peers=20, total_experts=128, batch_size=32, beam_size=4, parallel_rpc=4, grid_dims=(32, 32, 32)
     n_peers=20, total_experts=128, batch_size=32, beam_size=4, parallel_rpc=4, grid_dims=(32, 32, 32)
 ):
 ):
-    dht = [hivemind.DHT(start=True)]
-    initial_peers = dht[0].get_visible_maddrs()
-    dht += [hivemind.DHT(initial_peers=initial_peers, start=True) for _ in range(n_peers - 1)]
+    dht_instances = [hivemind.DHT(start=True)]
+    initial_peers = dht_instances[0].get_visible_maddrs()
+    dht_instances += [hivemind.DHT(initial_peers=initial_peers, start=True) for _ in range(n_peers - 1)]
 
 
     real_experts = sorted(
     real_experts = sorted(
         {"expert." + ".".join([str(random.randint(0, dim - 1)) for dim in grid_dims]) for _ in range(total_experts)}
         {"expert." + ".".join([str(random.randint(0, dim - 1)) for dim in grid_dims]) for _ in range(total_experts)}
     )
     )
     for batch_start in range(0, len(real_experts), batch_size):
     for batch_start in range(0, len(real_experts), batch_size):
+        dht = random.choice(dht_instances)
         declare_experts(
         declare_experts(
-            random.choice(dht),
+            dht,
             real_experts[batch_start : batch_start + batch_size],
             real_experts[batch_start : batch_start + batch_size],
-            wait=True,
-            endpoint=f"host{batch_start // batch_size}:{random.randint(0, 65536)}",
         )
         )
 
 
-    neighbors = sum([peer.get_visible_maddrs() for peer in random.sample(dht, min(3, len(dht)))], [])
+    neighbors = sum(
+        [peer.get_visible_maddrs() for peer in random.sample(dht_instances, min(3, len(dht_instances)))], []
+    )
     you = hivemind.DHT(start=True, initial_peers=neighbors, parallel_rpc=parallel_rpc)
     you = hivemind.DHT(start=True, initial_peers=neighbors, parallel_rpc=parallel_rpc)
     beam_search = MoEBeamSearcher(you, "expert.", grid_dims)
     beam_search = MoEBeamSearcher(you, "expert.", grid_dims)
 
 
@@ -89,22 +91,25 @@ def test_dht_single_node():
     node = hivemind.DHT(start=True)
     node = hivemind.DHT(start=True)
     beam_search = MoEBeamSearcher(node, "expert.", grid_size=(10,))
     beam_search = MoEBeamSearcher(node, "expert.", grid_size=(10,))
 
 
-    assert all(declare_experts(node, ["expert.1", "expert.2", "expert.3"], f"{hivemind.LOCALHOST}:1337").values())
-    assert len(declare_experts(node, ["ffn.1", "ffn.2"], endpoint="that_place")) == 4
-    assert len(declare_experts(node, ["e.1.2.3", "e.1.2.5", "e.2.0"], f"{hivemind.LOCALHOST}:42")) == 7
+    assert all(declare_experts(node, ["expert.1", "expert.2", "expert.3"]).values())
+    assert len(declare_experts(node, ["ffn.1", "ffn.2"])) == 4
+    assert len(declare_experts(node, ["e.1.2.3", "e.1.2.5", "e.2.0"])) == 7
 
 
     for expert in get_experts(node, ["expert.3", "expert.2"]):
     for expert in get_experts(node, ["expert.3", "expert.2"]):
-        assert expert.endpoint == f"{hivemind.LOCALHOST}:1337"
+        assert expert.server_peer_info.peer_id == node.peer_id
 
 
-    assert all(declare_experts(node, ["expert.5", "expert.2"], f"{hivemind.LOCALHOST}:1337").values())
+    assert all(declare_experts(node, ["expert.5", "expert.2"]).values())
     found_experts = beam_search.find_best_experts([(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)], beam_size=2)
     found_experts = beam_search.find_best_experts([(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)], beam_size=2)
     assert len(found_experts) == 2 and [expert.uid for expert in found_experts] == ["expert.5", "expert.3"]
     assert len(found_experts) == 2 and [expert.uid for expert in found_experts] == ["expert.5", "expert.3"]
 
 
     successors = beam_search.get_active_successors(["e.1.2.", "e.2.", "e.4.5."])
     successors = beam_search.get_active_successors(["e.1.2.", "e.2.", "e.4.5."])
     assert len(successors["e.1.2."]) == 2
     assert len(successors["e.1.2."]) == 2
-    assert successors["e.1.2."][3] == UidEndpoint("e.1.2.3", f"{LOCALHOST}:42")
-    assert successors["e.1.2."][5] == UidEndpoint("e.1.2.5", f"{LOCALHOST}:42")
-    assert len(successors["e.2."]) == 1 and successors["e.2."][0] == UidEndpoint("e.2.0", f"{LOCALHOST}:42")
+
+    peer_info = PeerInfo(node.peer_id, [a.decapsulate("/p2p/" + a.get("p2p")) for a in node.get_visible_maddrs()])
+
+    assert successors["e.1.2."][3] == UidEndpoint("e.1.2.3", peer_info)
+    assert successors["e.1.2."][5] == UidEndpoint("e.1.2.5", peer_info)
+    assert len(successors["e.2."]) == 1 and successors["e.2."][0] == UidEndpoint("e.2.0", peer_info)
     assert successors["e.4.5."] == {}
     assert successors["e.4.5."] == {}
 
 
     initial_beam = beam_search.get_initial_beam((3, 2, 1, 0, -1, -2, -3), beam_size=3)
     initial_beam = beam_search.get_initial_beam((3, 2, 1, 0, -1, -2, -3), beam_size=3)
@@ -194,7 +199,7 @@ async def test_negative_caching(n_peers=10):
     peers += [hivemind.DHT(initial_peers=initial_peers, start=True, **dht_kwargs) for _ in range(n_peers - 1)]
     peers += [hivemind.DHT(initial_peers=initial_peers, start=True, **dht_kwargs) for _ in range(n_peers - 1)]
 
 
     writer_peer = random.choice(peers)
     writer_peer = random.choice(peers)
-    assert all(declare_experts(writer_peer, ["ffn.1.2.3", "ffn.3.4.5"], "myaddr:1234").values())
+    assert all(declare_experts(writer_peer, ["ffn.1.2.3", "ffn.3.4.5"]).values())
 
 
     neighbors = sum([peer.get_visible_maddrs() for peer in random.sample(peers, min(3, len(peers)))], [])
     neighbors = sum([peer.get_visible_maddrs() for peer in random.sample(peers, min(3, len(peers)))], [])
     neg_caching_peer = hivemind.DHT(initial_peers=neighbors, start=True, **dht_kwargs)
     neg_caching_peer = hivemind.DHT(initial_peers=neighbors, start=True, **dht_kwargs)
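
Editor's note: with the endpoint argument removed, experts are now announced under the declaring peer's own identity, and clients resolve them to a PeerInfo rather than a host:port string. A minimal sketch of the updated API (the expert names are arbitrary):

import hivemind
from hivemind.moe.server import declare_experts, get_experts

dht = hivemind.DHT(start=True)

# no endpoint argument anymore: the local peer's info is stored alongside each UID
assert all(declare_experts(dht, ["demo_expert.0", "demo_expert.1"]).values())

found, missing = get_experts(dht, ["demo_expert.0", "demo_expert.9"])
assert found.server_peer_info.peer_id == dht.peer_id
assert missing is None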

+ 40 - 28
tests/test_moe.py

@@ -1,13 +1,14 @@
-import grpc
 import numpy as np
 import numpy as np
 import pytest
 import pytest
 import torch
 import torch
 
 
 from hivemind.dht import DHT
 from hivemind.dht import DHT
-from hivemind.moe.client import RemoteExpert, RemoteMixtureOfExperts, RemoteSwitchMixtureOfExperts
-from hivemind.moe.client.moe import DUMMY, _RemoteCallMany
+from hivemind.moe.client.expert import RemoteExpert, RemoteExpertInfo, create_remote_experts
+from hivemind.moe.client.moe import DUMMY, RemoteMixtureOfExperts, _RemoteCallMany
+from hivemind.moe.client.switch_moe import RemoteSwitchMixtureOfExperts
 from hivemind.moe.server import ExpertBackend, Server, background_server, declare_experts
 from hivemind.moe.server import ExpertBackend, Server, background_server, declare_experts
 from hivemind.moe.server.layers import name_to_block
 from hivemind.moe.server.layers import name_to_block
+from hivemind.p2p.p2p_daemon_bindings.control import P2PDaemonError
 from hivemind.utils.tensor_descr import BatchTensorDescriptor
 from hivemind.utils.tensor_descr import BatchTensorDescriptor
 
 
 
 
@@ -18,8 +19,8 @@ def test_moe():
     ]
     ]
     with background_server(
     with background_server(
         expert_uids=all_expert_uids, device="cpu", expert_cls="ffn", num_handlers=1, hidden_dim=16
         expert_uids=all_expert_uids, device="cpu", expert_cls="ffn", num_handlers=1, hidden_dim=16
-    ) as (server_endpoint, dht_maddrs):
-        dht = DHT(start=True, initial_peers=dht_maddrs)
+    ) as server_peer_info:
+        dht = DHT(start=True, initial_peers=server_peer_info.addrs)
 
 
         dmoe = RemoteMixtureOfExperts(in_features=16, grid_size=(4, 4, 4), dht=dht, k_best=3, uid_prefix="ffn.")
         dmoe = RemoteMixtureOfExperts(in_features=16, grid_size=(4, 4, 4), dht=dht, k_best=3, uid_prefix="ffn.")
 
 
@@ -35,9 +36,8 @@ def test_no_experts():
     ]
     ]
     with background_server(
     with background_server(
         expert_uids=all_expert_uids, device="cpu", expert_cls="nop_delay", num_handlers=1, hidden_dim=16
         expert_uids=all_expert_uids, device="cpu", expert_cls="nop_delay", num_handlers=1, hidden_dim=16
-    ) as (server_endpoint, dht_maddrs):
-        dht = DHT(start=True, initial_peers=dht_maddrs)
-
+    ) as server_peer_info:
+        dht = DHT(start=True, initial_peers=server_peer_info.addrs)
         dmoe = RemoteSwitchMixtureOfExperts(
         dmoe = RemoteSwitchMixtureOfExperts(
             in_features=16,
             in_features=16,
             grid_size=(4, 4, 4),
             grid_size=(4, 4, 4),
@@ -71,12 +71,16 @@ def test_call_many(hidden_dim=16):
         num_handlers=1,
         num_handlers=1,
         hidden_dim=hidden_dim,
         hidden_dim=hidden_dim,
         optim_cls=None,
         optim_cls=None,
-        no_dht=True,
-    ) as (server_endpoint, _):
+    ) as server_peer_info:
         inputs = torch.randn(4, hidden_dim, requires_grad=True)
         inputs = torch.randn(4, hidden_dim, requires_grad=True)
         inputs_clone = inputs.clone().detach().requires_grad_(True)
         inputs_clone = inputs.clone().detach().requires_grad_(True)
-        e0, e1, e2, e3, e4 = [RemoteExpert(f"expert.{i}", server_endpoint) for i in range(5)]
-        e5 = RemoteExpert(f"thisshouldnotexist", "127.0.0.1:80")
+
+        dht = DHT(initial_peers=server_peer_info.addrs, start=True)
+        e0, e1, e2, e3, e4 = create_remote_experts(
+            [RemoteExpertInfo(uid=f"expert.{i}", peer_info=server_peer_info) for i in range(5)],
+            dht,
+        )
+        e5 = RemoteExpert(RemoteExpertInfo(f"thisshouldnotexist", server_peer_info), None)
 
 
         mask, expert_outputs = _RemoteCallMany.apply(
         mask, expert_outputs = _RemoteCallMany.apply(
             DUMMY,
             DUMMY,
@@ -129,11 +133,15 @@ def test_remote_module_call(hidden_dim=16):
         num_handlers=1,
         num_handlers=1,
         hidden_dim=hidden_dim,
         hidden_dim=hidden_dim,
         optim_cls=None,
         optim_cls=None,
-        no_dht=True,
-    ) as (server_endpoint, _):
-        real_expert = RemoteExpert("expert.0", server_endpoint)
-        fake_expert = RemoteExpert("oiasfjiasjf", server_endpoint)
-
+    ) as server_peer_info:
+        dht = DHT(initial_peers=server_peer_info.addrs, start=True)
+        real_expert, fake_expert = create_remote_experts(
+            [
+                RemoteExpertInfo(uid="expert.0", peer_info=server_peer_info),
+                RemoteExpertInfo(uid="oiasfjiasjf", peer_info=server_peer_info),
+            ],
+            dht=dht,
+        )
         out1 = real_expert(torch.randn(1, hidden_dim))
         out1 = real_expert(torch.randn(1, hidden_dim))
         assert out1.shape == (1, hidden_dim)
         assert out1.shape == (1, hidden_dim)
         dummy_x = torch.randn(3, hidden_dim, requires_grad=True)
         dummy_x = torch.randn(3, hidden_dim, requires_grad=True)
@@ -144,9 +152,9 @@ def test_remote_module_call(hidden_dim=16):
         out3_again.norm().backward()
         out3_again.norm().backward()
         assert dummy_x.grad is not None and dummy_x.grad.norm() > 0
         assert dummy_x.grad is not None and dummy_x.grad.norm() > 0
 
 
-        with pytest.raises(grpc.RpcError):
+        with pytest.raises(P2PDaemonError):
             real_expert(torch.randn(3, 11))
             real_expert(torch.randn(3, 11))
-        with pytest.raises(grpc.RpcError):
+        with pytest.raises(P2PDaemonError):
             fake_expert(dummy_x)
             fake_expert(dummy_x)
 
 
 
 
@@ -154,11 +162,11 @@ def test_remote_module_call(hidden_dim=16):
 def test_beam_search_correctness():
 def test_beam_search_correctness():
     all_expert_uids = [f"ffn.{5 + i}.{10 + j}.{15 + k}" for i in range(10) for j in range(10) for k in range(10)]
     all_expert_uids = [f"ffn.{5 + i}.{10 + j}.{15 + k}" for i in range(10) for j in range(10) for k in range(10)]
     dht = DHT(start=True)
     dht = DHT(start=True)
-    assert all(declare_experts(dht, all_expert_uids, endpoint="fake-endpoint"))
+    assert all(declare_experts(dht, all_expert_uids))
 
 
     dmoe = RemoteMixtureOfExperts(in_features=32, grid_size=(32, 32, 32), dht=dht, k_best=4, uid_prefix="ffn.")
     dmoe = RemoteMixtureOfExperts(in_features=32, grid_size=(32, 32, 32), dht=dht, k_best=4, uid_prefix="ffn.")
 
 
-    for i in range(25):
+    for _ in range(25):
         input = torch.randn(32)
         input = torch.randn(32)
         grid_scores = dmoe.proj(input).split_with_sizes(dmoe.beam_search.grid_size, dim=-1)
         grid_scores = dmoe.proj(input).split_with_sizes(dmoe.beam_search.grid_size, dim=-1)
 
 
@@ -173,7 +181,7 @@ def test_beam_search_correctness():
         # reference: independently find :beam_size: best experts with exhaustive search
         # reference: independently find :beam_size: best experts with exhaustive search
         all_scores = dmoe.compute_expert_scores(
         all_scores = dmoe.compute_expert_scores(
             [dim_scores.unsqueeze(0) for dim_scores in grid_scores],
             [dim_scores.unsqueeze(0) for dim_scores in grid_scores],
-            [[RemoteExpert(uid, "") for uid in all_expert_uids]],
+            [[RemoteExpert(RemoteExpertInfo(uid, None), None) for uid in all_expert_uids]],
         )[0]
         )[0]
         true_best_scores = sorted(all_scores.cpu().detach().numpy(), reverse=True)[: len(chosen_experts)]
         true_best_scores = sorted(all_scores.cpu().detach().numpy(), reverse=True)[: len(chosen_experts)]
 
 
@@ -194,9 +202,12 @@ def test_determinism(hidden_dim=16):
         num_handlers=1,
         num_handlers=1,
         hidden_dim=hidden_dim,
         hidden_dim=hidden_dim,
         optim_cls=None,
         optim_cls=None,
-        no_dht=True,
-    ) as (server_endpoint, _):
-        expert = RemoteExpert(uid=f"expert.0", endpoint=server_endpoint)
+    ) as server_peer_info:
+        dht = DHT(initial_peers=server_peer_info.addrs, start=True)
+        expert = create_remote_experts(
+            [RemoteExpertInfo(uid="expert.0", peer_info=server_peer_info)],
+            dht=dht,
+        )[0]
 
 
         out = expert(xx, mask)
         out = expert(xx, mask)
         out_rerun = expert(xx, mask)
         out_rerun = expert(xx, mask)
@@ -220,7 +231,7 @@ def test_compute_expert_scores():
         jj = [[2, 2, 1], [0, 1, 2, 0, 1], [0], [1, 2]]
         jj = [[2, 2, 1], [0, 1, 2, 0, 1], [0], [1, 2]]
         batch_experts = [
         batch_experts = [
             [
             [
-                RemoteExpert(uid=f"expert.{ii[batch_i][expert_i]}.{jj[batch_i][expert_i]}", endpoint="[::]:1337")
+                RemoteExpert(RemoteExpertInfo(f"expert.{ii[batch_i][expert_i]}.{jj[batch_i][expert_i]}", None), None)
                 for expert_i in range(len(ii[batch_i]))
                 for expert_i in range(len(ii[batch_i]))
             ]
             ]
             for batch_i in range(len(ii))
             for batch_i in range(len(ii))
@@ -261,9 +272,10 @@ def test_client_anomaly_detection():
     server.start()
     server.start()
     try:
     try:
         server.ready.wait()
         server.ready.wait()
+        client_side_dht = DHT(initial_peers=dht.get_visible_maddrs(), start=True)
 
 
         dmoe = RemoteMixtureOfExperts(
         dmoe = RemoteMixtureOfExperts(
-            in_features=16, grid_size=(3,), dht=dht, k_best=3, uid_prefix="expert.", detect_anomalies=True
+            in_features=16, grid_size=(3,), dht=client_side_dht, k_best=3, uid_prefix="expert.", detect_anomalies=True
         )
         )
 
 
         input = torch.randn(1, 16)
         input = torch.randn(1, 16)
@@ -280,7 +292,7 @@ def test_client_anomaly_detection():
             inf_loss.backward()
             inf_loss.backward()
 
 
         dmoe = RemoteMixtureOfExperts(
         dmoe = RemoteMixtureOfExperts(
-            in_features=16, grid_size=(4,), dht=dht, k_best=4, uid_prefix="expert.", detect_anomalies=True
+            in_features=16, grid_size=(4,), dht=client_side_dht, k_best=4, uid_prefix="expert.", detect_anomalies=True
         )
         )
         output = dmoe(input)
         output = dmoe(input)
         assert output.isfinite().all()
         assert output.isfinite().all()

+ 8 - 2
tests/test_p2p_daemon_bindings.py

@@ -560,13 +560,19 @@ async def test_client_stream_handler_success(p2pcs):
 
 
     writer.close()
     writer.close()
 
 
-    # test case: registering twice can override the previous registration
+    # test case: registering twice can't override the previous registration without balanced flag
     event_third = asyncio.Event()
     event_third = asyncio.Event()
 
 
     async def handler_third(stream_info, reader, writer):
     async def handler_third(stream_info, reader, writer):
         event_third.set()
         event_third.set()
 
 
-    await p2pcs[1].stream_handler(another_proto, handler_third)
+    # p2p raises now for doubled stream handlers
+    with pytest.raises(ControlFailure):
+        await p2pcs[1].stream_handler(another_proto, handler_third)
+
+    # add in balanced mode: handler should be placed in round robin queue
+    # and become the next to be called
+    await p2pcs[1].stream_handler(another_proto, handler_third, balanced=True)
     assert another_proto in p2pcs[1].control.handlers
     assert another_proto in p2pcs[1].control.handlers
     # ensure the handler is override
     # ensure the handler is override
     assert handler_third == p2pcs[1].control.handlers[another_proto]
     assert handler_third == p2pcs[1].control.handlers[another_proto]

+ 17 - 11
tests/test_training.py

@@ -8,7 +8,8 @@ import torch.nn.functional as F
 from sklearn.datasets import load_digits
 from sklearn.datasets import load_digits
 
 
 from hivemind import DHT
 from hivemind import DHT
-from hivemind.moe.client import RemoteExpert, RemoteMixtureOfExperts, RemoteSwitchMixtureOfExperts
+from hivemind.moe.client import RemoteMixtureOfExperts, RemoteSwitchMixtureOfExperts
+from hivemind.moe.client.expert import RemoteExpertInfo, create_remote_experts
 from hivemind.moe.server import background_server
 from hivemind.moe.server import background_server
 from hivemind.optim import DecentralizedAdam, DecentralizedSGD
 from hivemind.optim import DecentralizedAdam, DecentralizedSGD
 
 
@@ -19,12 +20,17 @@ def test_training(max_steps: int = 100, threshold: float = 0.9):
     X_train, y_train = torch.tensor(dataset["data"], dtype=torch.float), torch.tensor(dataset["target"])
     X_train, y_train = torch.tensor(dataset["data"], dtype=torch.float), torch.tensor(dataset["target"])
     SGD = partial(torch.optim.SGD, lr=0.05)
     SGD = partial(torch.optim.SGD, lr=0.05)
 
 
-    with background_server(num_experts=2, device="cpu", optim_cls=SGD, hidden_dim=64, num_handlers=1, no_dht=True) as (
-        server_endpoint,
-        _,
-    ):
-        expert1 = RemoteExpert("expert.0", server_endpoint)
-        expert2 = RemoteExpert("expert.1", server_endpoint)
+    with background_server(
+        num_experts=2, device="cpu", optim_cls=SGD, hidden_dim=64, num_handlers=1
+    ) as server_peer_info:
+        dht = DHT(initial_peers=server_peer_info.addrs, start=True)
+        expert1, expert2 = create_remote_experts(
+            [
+                RemoteExpertInfo(uid="expert.0", peer_info=server_peer_info),
+                RemoteExpertInfo(uid="expert.1", peer_info=server_peer_info),
+            ],
+            dht=dht,
+        )
         model = nn.Sequential(expert2, nn.ReLU(), expert1, nn.Linear(64, 2))
         model = nn.Sequential(expert2, nn.ReLU(), expert1, nn.Linear(64, 2))
 
 
         opt = SGD(model.parameters(), lr=0.05)
         opt = SGD(model.parameters(), lr=0.05)
@@ -54,8 +60,8 @@ def test_moe_training(max_steps: int = 100, threshold: float = 0.9, num_experts=
     all_expert_uids = [f"expert.{i}" for i in range(num_experts)]
     all_expert_uids = [f"expert.{i}" for i in range(num_experts)]
     with background_server(
     with background_server(
         expert_uids=all_expert_uids, device="cpu", optim_cls=SGD, hidden_dim=64, num_handlers=1
         expert_uids=all_expert_uids, device="cpu", optim_cls=SGD, hidden_dim=64, num_handlers=1
-    ) as (server_endpoint, dht_maddrs):
-        dht = DHT(start=True, initial_peers=dht_maddrs)
+    ) as server_peer_info:
+        dht = DHT(start=True, initial_peers=server_peer_info.addrs)
 
 
         moe = RemoteMixtureOfExperts(in_features=64, grid_size=(num_experts,), dht=dht, uid_prefix="expert.", k_best=2)
         moe = RemoteMixtureOfExperts(in_features=64, grid_size=(num_experts,), dht=dht, uid_prefix="expert.", k_best=2)
         model = nn.Sequential(moe, nn.Linear(64, 2))
         model = nn.Sequential(moe, nn.Linear(64, 2))
@@ -107,8 +113,8 @@ def test_switch_training(max_steps: int = 10, threshold: float = 0.9, num_expert
     all_expert_uids = [f"expert.{i}" for i in range(num_experts)]
     all_expert_uids = [f"expert.{i}" for i in range(num_experts)]
     with background_server(
     with background_server(
         expert_uids=all_expert_uids, device="cpu", optim_cls=SGD, hidden_dim=64, num_handlers=1
         expert_uids=all_expert_uids, device="cpu", optim_cls=SGD, hidden_dim=64, num_handlers=1
-    ) as (server_endpoint, dht_maddrs):
-        dht = DHT(start=True, initial_peers=dht_maddrs)
+    ) as server_peer_info:
+        dht = DHT(start=True, initial_peers=server_peer_info.addrs)
 
 
         model = SwitchNetwork(dht, 64, 2, num_experts)
         model = SwitchNetwork(dht, 64, 2, num_experts)
         opt = SGD(model.parameters(), lr=0.05)
         opt = SGD(model.parameters(), lr=0.05)

+ 1 - 47
tests/test_util_modules.py

@@ -11,9 +11,7 @@ import torch
 
 
 import hivemind
 import hivemind
 from hivemind.compression import deserialize_torch_tensor, serialize_torch_tensor
 from hivemind.compression import deserialize_torch_tensor, serialize_torch_tensor
-from hivemind.proto.dht_pb2_grpc import DHTStub
 from hivemind.proto.runtime_pb2 import CompressionType
 from hivemind.proto.runtime_pb2 import CompressionType
-from hivemind.proto.runtime_pb2_grpc import ConnectionHandlerStub
 from hivemind.utils import BatchTensorDescriptor, DHTExpiration, HeapEntry, MSGPackSerializer, ValueWithExpiration
 from hivemind.utils import BatchTensorDescriptor, DHTExpiration, HeapEntry, MSGPackSerializer, ValueWithExpiration
 from hivemind.utils.asyncio import (
 from hivemind.utils.asyncio import (
     achain,
     achain,
@@ -330,50 +328,6 @@ def test_many_futures():
     p.join()
     p.join()
 
 
 
 
-@pytest.mark.forked
-@pytest.mark.asyncio
-async def test_channel_cache():
-    hivemind.ChannelCache.MAXIMUM_CHANNELS = 3
-    hivemind.ChannelCache.EVICTION_PERIOD_SECONDS = 0.1
-
-    c1 = hivemind.ChannelCache.get_stub("localhost:1337", DHTStub, aio=False)
-    c2 = hivemind.ChannelCache.get_stub("localhost:1337", DHTStub, aio=True)
-    c3 = hivemind.ChannelCache.get_stub("localhost:1338", DHTStub, aio=False)
-    c3_again = hivemind.ChannelCache.get_stub("localhost:1338", DHTStub, aio=False)
-    c1_again = hivemind.ChannelCache.get_stub("localhost:1337", DHTStub, aio=False)
-    c4 = hivemind.ChannelCache.get_stub("localhost:1339", DHTStub, aio=True)
-    c2_anew = hivemind.ChannelCache.get_stub("localhost:1337", DHTStub, aio=True)
-    c1_yetagain = hivemind.ChannelCache.get_stub("localhost:1337", DHTStub, aio=False)
-
-    await asyncio.sleep(0.2)
-    c1_anew = hivemind.ChannelCache.get_stub(target="localhost:1337", aio=False, stub_type=DHTStub)
-    c1_anew_again = hivemind.ChannelCache.get_stub(target="localhost:1337", aio=False, stub_type=DHTStub)
-    c1_otherstub = hivemind.ChannelCache.get_stub(target="localhost:1337", aio=False, stub_type=ConnectionHandlerStub)
-    await asyncio.sleep(0.05)
-    c1_otherstub_again = hivemind.ChannelCache.get_stub(
-        target="localhost:1337", aio=False, stub_type=ConnectionHandlerStub
-    )
-    all_channels = [c1, c2, c3, c4, c3_again, c1_again, c2_anew, c1_yetagain, c1_anew, c1_anew_again, c1_otherstub]
-
-    assert all(isinstance(c, DHTStub) for c in all_channels[:-1])
-    assert isinstance(all_channels[-1], ConnectionHandlerStub)
-    assert "aio" in repr(c2.rpc_find)
-    assert "aio" not in repr(c1.rpc_find)
-
-    duplicates = {
-        (c1, c1_again),
-        (c1, c1_yetagain),
-        (c1_again, c1_yetagain),
-        (c3, c3_again),
-        (c1_anew, c1_anew_again),
-        (c1_otherstub, c1_otherstub_again),
-    }
-    for i in range(len(all_channels)):
-        for j in range(i + 1, len(all_channels)):
-            ci, cj = all_channels[i], all_channels[j]
-            assert (ci is cj) == ((ci, cj) in duplicates), (i, j)
-
-
 def test_serialize_tuple():
 def test_serialize_tuple():
     test_pairs = (
     test_pairs = (
         ((1, 2, 3), [1, 2, 3]),
         ((1, 2, 3), [1, 2, 3]),
@@ -419,7 +373,7 @@ def test_split_parts():
     for combined in combined_incomplete, combined_incomplete2, combined_incomplete3:
     for combined in combined_incomplete, combined_incomplete2, combined_incomplete3:
         with pytest.raises(RuntimeError):
         with pytest.raises(RuntimeError):
             deserialize_torch_tensor(combined)
             deserialize_torch_tensor(combined)
-            # note: we rely on this being RuntimeError in hivemind.averaging.allreduce.AllreduceRunner
+            # note: we rely on this being RuntimeError in hivemind.averaging.allreduce.AllReduceRunner
 
 
 
 
 def test_generic_data_classes():
 def test_generic_data_classes():