
massive refactors (#74)

* remove JoblibSerializer

* rename SharedFuture to MPFuture

* move DUMMY to hivemind.expert, remove hivemind.utils.data

* style: line length <130

* changed CheckpointSaver.dir => checkpoint_dir (rationale: same name everywhere, avoid shadowing builtin dir)

* rename SharedFuture to MPFuture

* rename conn_handler_processes => num_connection_handlers (rationale: to make it similar to num_experts, num_workers, num_replicas, num_threads)

* remove hivemind.utils.data

* typo sucessfully => successfully

* rename SharedFuture to MPFuture

* move hivemind.runtime.* into hivemind.server

* tuple endpoint => string endpoint (#76)

* wip: passed test_dht

* wip: passed test_training

* wip: all tests should work now

* wip: benchmark_dht works now

* wip: benchmark_throughput works now

* DHT now accepts initial_peers same way as DHTNode

* fix initial_peers in test_training

* review by @mryab

* review: move import multiprocessing.* to files that require them

* review: remove unused import ctypes

* review: add todo

* review: inline strip_endpoint
justheuristic, 5 years ago
parent
commit f496f2c14a

+ 1 - 4
docs/modules/server.rst

@@ -1,4 +1,4 @@
-``hivemind.server & runtime``
+**Hivemind Server**
 ========================================
 
 .. automodule:: hivemind.server
@@ -9,13 +9,10 @@
    :members:
    :member-order: bysource
 
-.. currentmodule:: hivemind.runtime
-
 .. autoclass:: Runtime
     :members:
     :member-order: bysource
 
-
 .. autoclass:: ExpertBackend
     :members: forward, backward, apply_gradients, get_info, get_pools
     :member-order: bysource

+ 1 - 2
hivemind/__init__.py

@@ -1,7 +1,6 @@
 from hivemind.client import *
 from hivemind.dht import *
-from hivemind.server import Server
+from hivemind.server import *
 from hivemind.utils import *
-from hivemind.runtime import *
 
 __version__ = '0.7.1'

+ 13 - 14
hivemind/client/expert.py

@@ -7,9 +7,11 @@ import torch
 import torch.nn as nn
 from torch.autograd.function import once_differentiable
 
-from hivemind.utils import nested_flatten, DUMMY, nested_pack, nested_compare
+from hivemind.utils import nested_flatten, nested_pack, nested_compare, Endpoint
 from hivemind.utils.grpc import serialize_torch_tensor, deserialize_torch_tensor, runtime_pb2, runtime_grpc
 
+DUMMY = torch.empty(0, requires_grad=True)  # dummy tensor that triggers autograd in RemoteExpert
+
 
 class RemoteExpert(nn.Module):
     """
@@ -20,20 +22,18 @@ class RemoteExpert(nn.Module):
     Sending wrong input shapes can cause RemoteExpert to freeze indefinitely due to error in runtime.
 
     :param uid: unique expert identifier
-    :param host: hostname where server operates
-    :param port: port to which server listens
+    :param endpoint: network endpoint of a server that services that expert, e.g. "201.123.321.99:1337" or "[::]:8080"
     """
 
-    def __init__(self, uid, host='127.0.0.1', port=8080):
+    def __init__(self, uid, endpoint: Endpoint):
         super().__init__()
-        self.uid, self.host, self.port = uid, host, port
-        self._channel, self._stub = None, None
-        self._info = None
+        self.uid, self.endpoint = uid, endpoint
+        self._channel, self._stub, self._info = None, None, None
 
     @property
     def stub(self):
         if self._channel is None:
-            self._channel = grpc.insecure_channel(f'{self.host}:{self.port}', options=[
+            self._channel = grpc.insecure_channel(self.endpoint, options=[
                 ('grpc.max_send_message_length', -1),
                 ('grpc.max_receive_message_length', -1)
             ])
@@ -57,8 +57,7 @@ class RemoteExpert(nn.Module):
         if not nested_compare(forward_inputs, self.info['forward_schema']):
             raise TypeError(f"Inputs do not match expert input schema. Did you pass the right number of parameters?")
 
-        flat_outputs = _RemoteModuleCall.apply(DUMMY, self.uid, self.host, self.port, self.stub,
-                                               *nested_flatten(forward_inputs))
+        flat_outputs = _RemoteModuleCall.apply(DUMMY, self.uid, self.stub, *nested_flatten(forward_inputs))
         # Note: we send DUMMY to prevent torch from excluding expert from backward if no other inputs require grad
         return nested_pack(flat_outputs, structure=self.info['outputs_schema'])
 
@@ -70,18 +69,18 @@ class RemoteExpert(nn.Module):
         return self._info
 
     def extra_repr(self):
-        return f"uid={self.uid}, host={self.host}, port={self.port}"
+        return f"uid={self.uid}, endpoint={self.endpoint}"
 
 
 class _RemoteModuleCall(torch.autograd.Function):
     """ Internal autograd-friendly call of a remote module. For applications, use RemoteExpert instead. """
 
     @staticmethod
-    def forward(ctx, dummy: torch.Tensor, uid: str, host: str, port: int, stub: runtime_grpc.ConnectionHandlerStub,
+    def forward(ctx, dummy: torch.Tensor, uid: str, stub: runtime_grpc.ConnectionHandlerStub,
                 *inputs: torch.Tensor) -> Tuple[torch.Tensor, ...]:
         # Note: *inputs are flattened input tensors that follow the expert's info['input_schema']
         inputs = tuple(map(torch.Tensor.detach, inputs))  # detach to avoid pickling the computation graph
-        ctx.uid, ctx.host, ctx.port, ctx.stub = uid, host, port, stub
+        ctx.uid, ctx.stub = uid, stub
         ctx.save_for_backward(*inputs)
 
         outputs = stub.forward(
@@ -100,4 +99,4 @@ class _RemoteModuleCall(torch.autograd.Function):
             runtime_pb2.ExpertRequest(uid=ctx.uid, tensors=[serialize_torch_tensor(tensor) for tensor in payload]))
 
         deserialized_grad_inputs = [deserialize_torch_tensor(tensor) for tensor in grad_inputs.tensors]
-        return (DUMMY, None, None, None, None, *deserialized_grad_inputs)
+        return (DUMMY, None, None, *deserialized_grad_inputs)
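
For reference, a minimal usage sketch of the new interface (not part of the diff; the endpoint and input size are illustrative and assume a server hosting 'expert.0' is already listening there). RemoteExpert is now constructed from a single "host:port" endpoint string instead of separate host and port arguments:

    import torch
    import hivemind

    # assumes a running hivemind server that hosts an expert named 'expert.0' with 1024-dim inputs
    expert = hivemind.RemoteExpert(uid='expert.0', endpoint='127.0.0.1:8080')
    output = expert(torch.randn(2, 1024))  # forwarded over gRPC; DUMMY keeps the call attached to autograd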

+ 6 - 6
hivemind/client/moe.py

@@ -6,8 +6,8 @@ import torch
 import torch.nn as nn
 from torch.autograd.function import once_differentiable
 
-from hivemind.client.expert import RemoteExpert, _RemoteModuleCall
-from hivemind.utils import nested_map, run_and_await_k, nested_pack, nested_flatten, DUMMY, run_in_background, \
+from hivemind.client.expert import RemoteExpert, _RemoteModuleCall, DUMMY
+from hivemind.utils import nested_map, run_and_await_k, nested_pack, nested_flatten, run_in_background, \
     run_isolated_forward, EmulatedAutogradContext, run_isolated_backward, map_with_parallel_backward
 
 
@@ -43,7 +43,8 @@ class RemoteMixtureOfExperts(nn.Module):
         self.dht, self.grid_size = dht, grid_size
         self.uid_prefix, self.expert_padding = uid_prefix, expert_padding
         self.k_best, self.k_min, self.backward_k_min = k_best, k_min, backward_k_min
-        self.forward_timeout, self.timeout_after_k_min, self.backward_timeout = forward_timeout, timeout_after_k_min, backward_timeout
+        self.forward_timeout, self.backward_timeout = forward_timeout, backward_timeout
+        self.timeout_after_k_min = timeout_after_k_min
         self.allow_broadcasting = allow_broadcasting
 
         self.proj = nn.Linear(in_features, sum(grid_size))  # jointly predict logits for all grid dimensions
@@ -258,11 +259,10 @@ class _RemoteMoECall(torch.autograd.Function):
     @staticmethod
     def _run_expert_forward(expert: RemoteExpert, *args: torch.Tensor, **kwargs: torch.Tensor):
         """ Call remote expert and return flattened outputs. Compatible with concurrent autograd. """
-        return run_isolated_forward(_RemoteModuleCall, DUMMY, expert.uid, expert.host, expert.port, expert.stub,
-                                    *nested_flatten((args, kwargs)))
+        return run_isolated_forward(_RemoteModuleCall, DUMMY, expert.uid, expert.stub, *nested_flatten((args, kwargs)))
 
     @staticmethod
     def _run_expert_backward(ctx: EmulatedAutogradContext, weight: torch.Tensor, *grad_outputs: torch.Tensor):
         backward_result = run_isolated_backward(_RemoteModuleCall, ctx, *(grad * weight for grad in grad_outputs))
-        grad_dummy, no_grad_uid, no_grad_hostname, no_grad_port, no_grad_stub, *grad_inputs = backward_result
+        grad_dummy, no_grad_uid, no_grad_stub, *grad_inputs = backward_result
         return grad_inputs

+ 20 - 21
hivemind/dht/__init__.py

@@ -16,21 +16,21 @@ import asyncio
 import ctypes
 import multiprocessing as mp
 import warnings
-from typing import List, Optional
+from typing import List, Optional, Sequence
 
 import uvloop
 
 from hivemind.client import RemoteExpert
 from hivemind.dht.node import DHTNode, DHTID, DHTExpiration
 from hivemind.dht.routing import get_dht_time
-from hivemind.utils import SharedFuture, Endpoint, run_in_background
+from hivemind.utils import MPFuture, Endpoint, run_in_background
 
 
 class DHT(mp.Process):
     """
     A high-level interface to hivemind DHT. Runs a dht node in a background process.
 
-    :param initial_peers: one or multiple pairs of (host, port) pointing to active DHT peers. Default: no peers
+    :param initial_peers: one or multiple endpoints pointing to active DHT peers. Similar format to listen_on.
     :param listen_on: an interface for incoming connections, e.g. "127.0.0.1:*", "0.0.0.0:1234" or "ipv6:[::]:*"
     :param start: if True, automatically starts the background process on creation. Otherwise await manual start
     :param daemon: if True, the background process is marked as daemon and automatically terminated after main process
@@ -42,12 +42,12 @@ class DHT(mp.Process):
     EXPIRATION = 120  # anything written to DHT is considered expired after this many seconds
     make_key = "{}::{}".format
 
-    def __init__(self, *initial_peers: Endpoint, listen_on: Endpoint = "0.0.0.0:*", start: bool, daemon: bool = True,
-                 max_workers: Optional[int] = None, parallel_rpc: Optional[int] = None, **kwargs):
+    def __init__(self, listen_on: Endpoint = "0.0.0.0:*", initial_peers: Sequence[Endpoint] = (), *, start: bool,
+                 daemon: bool = True, max_workers: Optional[int] = None, parallel_rpc: Optional[int] = None, **kwargs):
         super().__init__()
         self.listen_on, self.initial_peers, self.kwargs = listen_on, initial_peers, kwargs
         self.max_workers, self.parallel_rpc = max_workers, parallel_rpc
-        self._port = mp.Value(ctypes.c_int32, 0)  # initialized after server starts
+        self._port = mp.Value(ctypes.c_int32, 0)  # initialized after dht starts
         self.node: Optional[DHTNode] = None  # initialized inside self.run only
         self._pipe, self.pipe = mp.Pipe(duplex=True)
         self.ready = mp.Event()
@@ -99,11 +99,11 @@ class DHT(mp.Process):
         :param expiration: returns experts that expire no sooner than this (based on get_dht_time), default = now
         :returns: a list of [RemoteExpert if found else None]
         """
-        future, _future = SharedFuture.make_pair()
+        future, _future = MPFuture.make_pair()
         self.pipe.send(('_get_experts', [], dict(uids=uids, expiration=expiration, future=_future)))
         return future.result()
 
-    def _get_experts(self, uids: List[str], expiration: Optional[DHTExpiration], future: SharedFuture):
+    def _get_experts(self, uids: List[str], expiration: Optional[DHTExpiration], future: MPFuture):
         loop = asyncio.get_event_loop()
         expiration = expiration or get_dht_time()
         num_workers = len(uids) if self.max_workers is None else min(len(uids), self.max_workers)
@@ -114,41 +114,40 @@ class DHT(mp.Process):
 
         experts: List[Optional[RemoteExpert]] = [None] * len(uids)
         for i, (key, uid) in enumerate(zip(keys, uids)):
-            maybe_result, maybe_expiration = response[key]
+            maybe_endpoint, maybe_expiration = response[key]
             if maybe_expiration is not None:  # if we found a value
-                experts[i] = RemoteExpert(uid=uid, host=maybe_result[0], port=maybe_result[1])
+                experts[i] = RemoteExpert(uid=uid, endpoint=maybe_endpoint)
 
         future.set_result(experts)
 
-    def declare_experts(self, uids: List[str], addr, port, wait=True, timeout=None) -> Optional[List[bool]]:
+    def declare_experts(self, uids: List[str], endpoint: Endpoint, wait=True, timeout=None) -> Optional[List[bool]]:
         """
-        Make experts available to DHT; update timestamps if already available
+        Make experts visible to all DHT peers; update timestamps if declared previously.
 
         :param uids: a list of expert ids to update
-        :param addr: hostname that can be used to call this expert
-        :param port: port that can be used to call this expert
+        :param endpoint: endpoint that serves these experts, usually your server endpoint (e.g. "201.111.222.333:1337")
         :param wait: if True, awaits for declaration to finish, otherwise runs in background
         :param timeout: waits for the procedure to finish, None means wait indeninitely
         :returns: if wait, returns a list of booleans, (True = store succeeded, False = store rejected)
         """
-        future, _future = SharedFuture.make_pair() if wait else (None, None)
-        self.pipe.send(('_declare_experts', [], dict(uids=list(uids), addr=addr, port=port, future=_future)))
+        assert not isinstance(uids, str), "Please send a list / tuple of expert uids."
+        future, _future = MPFuture.make_pair() if wait else (None, None)
+        self.pipe.send(('_declare_experts', [], dict(uids=list(uids), endpoint=endpoint, future=_future)))
         if wait:
             return future.result(timeout)
 
-    def _declare_experts(self, uids: List[str], addr: str, port: int, future: Optional[SharedFuture]):
+    def _declare_experts(self, uids: List[str], endpoint: Endpoint, future: Optional[MPFuture]):
         assert self.node is not None, "This method should only be accessed from inside .run method"
         num_workers = len(uids) if self.max_workers is None else min(len(uids), self.max_workers)
         loop = asyncio.get_event_loop()
         expiration_time = get_dht_time() + self.EXPIRATION
         unique_prefixes = set()
-        coroutines = []
 
         keys, values = [], []
         for uid in uids:
             uid_parts = uid.split(self.UID_DELIMETER)
             keys.append(self.make_key('expert', uid))
-            values.append((addr, port))
+            values.append(endpoint)
             unique_prefixes.update([self.UID_DELIMETER.join(uid_parts[:i + 1]) for i in range(len(uid_parts))])
 
         for prefix in unique_prefixes:
@@ -171,12 +170,12 @@ class DHT(mp.Process):
         :returns: a list of at most :k: prefixes that have at least one active expert each;
         """
         assert isinstance(prefixes, (list, tuple)), "please provide a list/tuple of prefixes as the first argument"
-        future, _future = SharedFuture.make_pair()
+        future, _future = MPFuture.make_pair()
         self.pipe.send(('_first_k_active', [],
                         dict(prefixes=prefixes, k=k, max_prefetch=max_prefetch or k, future=_future)))
         return future.result()
 
-    def _first_k_active(self, prefixes: List[str], k: int, max_prefetch: Optional[int], future: SharedFuture):
+    def _first_k_active(self, prefixes: List[str], k: int, max_prefetch: Optional[int], future: MPFuture):
         assert self.node is not None, "This method should only be accessed from inside .run method"
         max_prefetch = max_prefetch or len(prefixes)
         loop = asyncio.get_event_loop()
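
A sketch of the reworked DHT interface under the new argument order (listen_on first, initial_peers as a sequence) and string endpoints; the expert names and port below are illustrative:

    import hivemind

    first_peer = hivemind.DHT(listen_on='0.0.0.0:*', start=True)
    second_peer = hivemind.DHT(initial_peers=[f'{hivemind.LOCALHOST}:{first_peer.port}'], start=True)

    # declare_experts now takes a single endpoint string instead of separate addr and port
    second_peer.declare_experts(['expert.0', 'expert.1'], endpoint=f'{hivemind.LOCALHOST}:8080')
    expert0, expert1 = first_peer.get_experts(['expert.0', 'expert.1'])
    assert expert0.endpoint == f'{hivemind.LOCALHOST}:8080'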

+ 6 - 6
hivemind/dht/node.py

@@ -314,7 +314,7 @@ class DHTNode:
 
         # search metadata
         unfinished_key_ids = set(key_ids)  # track key ids for which the search is not terminated
-        node_to_addr: Dict[DHTID, Endpoint] = dict()  # global routing table for all queries
+        node_to_endpoint: Dict[DHTID, Endpoint] = dict()  # global routing table for all queries
 
         SearchResult = namedtuple("SearchResult", ["binary_value", "expiration", "source_node_id"])
         latest_results = {key_id: SearchResult(b'', -float('inf'), None) for key_id in key_ids}
@@ -331,18 +331,18 @@ class DHTNode:
 
         # stage 2: traverse the DHT for any unfinished keys
         for key_id in unfinished_key_ids:
-            node_to_addr.update(self.protocol.routing_table.get_nearest_neighbors(
+            node_to_endpoint.update(self.protocol.routing_table.get_nearest_neighbors(
                 key_id, self.protocol.bucket_size, exclude=self.node_id))
 
         async def get_neighbors(peer: DHTID, queries: Collection[DHTID]) -> Dict[DHTID, Tuple[List[DHTID], bool]]:
             queries = list(queries)
-            response = await self.protocol.call_find(node_to_addr[peer], queries)
+            response = await self.protocol.call_find(node_to_endpoint[peer], queries)
             if not response:
                 return {query: ([], False) for query in queries}
 
             output: Dict[DHTID, Tuple[List[DHTID], bool]] = {}
             for key_id, (maybe_value, maybe_expiration, peers) in response.items():
-                node_to_addr.update(peers)
+                node_to_endpoint.update(peers)
                 if maybe_expiration is not None and maybe_expiration > latest_results[key_id].expiration:
                     latest_results[key_id] = SearchResult(maybe_value, maybe_expiration, peer)
                 should_interrupt = (latest_results[key_id].expiration >= sufficient_expiration_time)
@@ -350,7 +350,7 @@ class DHTNode:
             return output
 
         nearest_nodes_per_query, visited_nodes = await traverse_dht(
-            queries=list(unfinished_key_ids), initial_nodes=list(node_to_addr),
+            queries=list(unfinished_key_ids), initial_nodes=list(node_to_endpoint),
             beam_size=beam_size, num_workers=num_workers, queries_per_call=int(len(unfinished_key_ids) ** 0.5),
             get_neighbors=get_neighbors, visited_nodes={key_id: {self.node_id} for key_id in unfinished_key_ids})
 
@@ -367,7 +367,7 @@ class DHTNode:
                     if node_id == latest_node_id:
                         continue
                     asyncio.create_task(self.protocol.call_store(
-                        node_to_addr[node_id], [key_id], [latest_value_bytes], [latest_expiration], in_cache=True))
+                        node_to_endpoint[node_id], [key_id], [latest_value_bytes], [latest_expiration], in_cache=True))
                     num_cached_nodes += 1
                     if num_cached_nodes >= self.cache_nearest:
                         break

+ 5 - 7
hivemind/dht/protocol.py

@@ -4,7 +4,6 @@ from __future__ import annotations
 import asyncio
 import heapq
 import os
-import urllib.parse
 from typing import Optional, List, Tuple, Dict, Iterator, Any, Sequence, Union, Collection
 from warnings import warn
 
@@ -12,7 +11,7 @@ import grpc
 import grpc.experimental.aio
 
 from hivemind.dht.routing import RoutingTable, DHTID, BinaryDHTValue, DHTExpiration, get_dht_time
-from hivemind.utils import Endpoint, compile_grpc, get_logger
+from hivemind.utils import Endpoint, compile_grpc, get_logger, replace_port, get_port
 
 logger = get_logger(__name__)
 
@@ -40,7 +39,7 @@ class DHTProtocol(dht_grpc.DHTServicer):
         See DHTNode (node.py) for a more detailed description.
 
         :note: the rpc_* methods defined in this class will be automatically exposed to other DHT nodes,
-         for instance, def rpc_ping can be called as protocol.call_ping(addr, dht_id) from a remote machine
+         for instance, def rpc_ping can be called as protocol.call_ping(endpoint, dht_id) from a remote machine
          Only the call_* methods are meant to be called publicly, e.g. from DHTNode
          Read more: https://github.com/bmuller/rpcudp/tree/master/rpcudp
         """
@@ -109,9 +108,8 @@ class DHTProtocol(dht_grpc.DHTServicer):
         """ Some node wants us to add it to our routing table. """
         if peer_info.node_id and peer_info.rpc_port:
             sender_id = DHTID.from_bytes(peer_info.node_id)
-            peer_url = urllib.parse.urlparse(context.peer())
-            address = peer_url.path[:peer_url.path.rindex(':')]
-            asyncio.create_task(self.update_routing_table(sender_id, f"{address}:{peer_info.rpc_port}"))
+            rpc_endpoint = replace_port(context.peer(), new_port=peer_info.rpc_port)
+            asyncio.create_task(self.update_routing_table(sender_id, rpc_endpoint))
         return self.node_info
 
     async def call_store(self, peer: Endpoint, keys: Sequence[DHTID], values: Sequence[BinaryDHTValue],
@@ -193,7 +191,7 @@ class DHTProtocol(dht_grpc.DHTServicer):
                 output[key] = (value, expiration, nearest)
             return output
         except grpc.experimental.aio.AioRpcError as error:
-            logger.warning(f"DHTProtocol failed to store at {peer}: {error.code()}")
+            logger.warning(f"DHTProtocol failed to find at {peer}: {error.code()}")
             asyncio.create_task(self.update_routing_table(self.routing_table.get(endpoint=peer), peer, responded=False))
 
     async def rpc_find(self, request: dht_pb2.FindRequest, context: grpc.ServicerContext) -> dht_pb2.FindResponse:

+ 10 - 10
hivemind/dht/routing.py

@@ -47,7 +47,7 @@ class RoutingTable:
 
     def add_or_update_node(self, node_id: DHTID, endpoint: Endpoint) -> Optional[Tuple[DHTID, Endpoint]]:
         """
-        Update routing table after an incoming request from :addr: (host:port) or outgoing request to :addr:
+        Update routing table after an incoming request from :endpoint: or outgoing request to :endpoint:
 
         :returns: If we cannot add node_id to the routing table, return the least-recently-updated node (Section 2.2)
         :note: DHTProtocol calls this method for every incoming and outgoing request if there was a response.
@@ -91,7 +91,7 @@ class RoutingTable:
         """ Find endpoint for a given DHTID or vice versa """
         return self.uid_to_endpoint[item] if isinstance(item, DHTID) else self.endpoint_to_uid[item]
 
-    def __setitem__(self, node_id: DHTID, addr: Endpoint) -> NotImplementedError:
+    def __setitem__(self, node_id: DHTID, endpoint: Endpoint) -> NotImplementedError:
         raise NotImplementedError("RoutingTable doesn't support direct item assignment. Use table.try_add_node instead")
 
     def __contains__(self, item: Union[DHTID, Endpoint]) -> bool:
@@ -160,7 +160,7 @@ class RoutingTable:
 class KBucket:
     """
     A bucket containing up to :size: of DHTIDs in [lower, upper) semi-interval.
-    Maps DHT node ids to their endpoints (hostname, addr)
+    Maps DHT node ids to their endpoints
     """
 
     def __init__(self, lower: int, upper: int, size: int, depth: int = 0):
@@ -175,13 +175,13 @@ class KBucket:
         """ Check if node_id is between this bucket's lower and upper bounds """
         return self.lower <= node_id < self.upper
 
-    def add_or_update_node(self, node_id: DHTID, addr: Endpoint) -> bool:
+    def add_or_update_node(self, node_id: DHTID, endpoint: Endpoint) -> bool:
         """
         Add node to KBucket or update existing node, return True if successful, False if the bucket is full.
         If the bucket is full, keep track of node in a replacement list, per section 4.1 of the paper.
 
         :param node_id: dht node identifier that should be added or moved to the front of bucket
-        :param addr: a pair of (hostname, port) associated with that node id
+        :param endpoint: network address associated with that node id
         :note: this function has a side-effect of resetting KBucket.last_updated time
         """
         if node_id in self.nodes_requested_for_ping:
@@ -189,13 +189,13 @@ class KBucket:
         self.last_updated = get_dht_time()
         if node_id in self.nodes_to_endpoint:
             del self.nodes_to_endpoint[node_id]
-            self.nodes_to_endpoint[node_id] = addr
+            self.nodes_to_endpoint[node_id] = endpoint
         elif len(self.nodes_to_endpoint) < self.size:
-            self.nodes_to_endpoint[node_id] = addr
+            self.nodes_to_endpoint[node_id] = endpoint
         else:
             if node_id in self.replacement_nodes:
                 del self.replacement_nodes[node_id]
-            self.replacement_nodes[node_id] = addr
+            self.replacement_nodes[node_id] = endpoint
             return False
         return True
 
@@ -229,9 +229,9 @@ class KBucket:
         assert self.lower < midpoint < self.upper, f"Bucket to small to be split: [{self.lower}: {self.upper})"
         left = KBucket(self.lower, midpoint, self.size, depth=self.depth + 1)
         right = KBucket(midpoint, self.upper, self.size, depth=self.depth + 1)
-        for node_id, addr in chain(self.nodes_to_endpoint.items(), self.replacement_nodes.items()):
+        for node_id, endpoint in chain(self.nodes_to_endpoint.items(), self.replacement_nodes.items()):
             bucket = left if int(node_id) <= midpoint else right
-            bucket.add_or_update_node(node_id, addr)
+            bucket.add_or_update_node(node_id, endpoint)
         return left, right
 
     def __repr__(self):

+ 18 - 12
hivemind/server/__init__.py

@@ -1,12 +1,16 @@
 import multiprocessing as mp
+import multiprocessing.synchronize
 import threading
 from typing import Dict, Optional
 
 from hivemind.dht import DHT
-from hivemind.runtime import Runtime, ExpertBackend
+from hivemind.server.runtime import Runtime
+from hivemind.server.task_pool import Task, TaskPool, TaskPoolBase
+from hivemind.server.expert_backend import ExpertBackend
 from hivemind.server.checkpoint_saver import CheckpointSaver
 from hivemind.server.connection_handler import ConnectionHandler
 from hivemind.server.dht_handler import DHTHandlerThread
+from hivemind.utils import Endpoint, get_port, replace_port, find_open_port
 
 
 class Server(threading.Thread):
@@ -20,11 +24,10 @@ class Server(threading.Thread):
      - follows orders from HivemindController - if it exists
 
     :type dht: DHT or None. Server with dht=None will NOT be visible from DHT,
-     but it will still support accessing experts directly with RemoteExpert(uid=UID, host=IPADDR, port=PORT).
+     but it will still support accessing experts directly with RemoteExpert(uid=UID, endpoint="IPADDR:PORT").
     :param expert_backends: dict{expert uid (str) : ExpertBackend} for all expert hosted by this server.
-    :param addr: server's dht address that determines how it can be accessed. Default is local connections only.
-    :param port: port to which server listens for requests such as expert forward or backward pass.
-    :param conn_handler_processes: maximum number of simultaneous requests. Please note that the default value of 1
+    :param listen_on: server's dht address that determines how it can be accessed. Address and (optional) port
+    :param num_connection_handlers: maximum number of simultaneous requests. Please note that the default value of 1
         if too small for normal functioning, we recommend 4 handlers per expert backend.
     :param update_period: how often will server attempt to publish its state (i.e. experts) to the DHT;
         if dht is None, this parameter is ignored.
@@ -32,13 +35,16 @@ class Server(threading.Thread):
         is ready (see .ready below)
     """
 
-    def __init__(self, dht: Optional[DHT], expert_backends: Dict[str, ExpertBackend], addr='127.0.0.1',
-                 port: int = 8080, conn_handler_processes: int = 1, update_period: int = 30, start=False, checkpoint_dir=None, **kwargs):
+    def __init__(
+            self, dht: Optional[DHT], expert_backends: Dict[str, ExpertBackend], listen_on: Endpoint = "0.0.0.0:*",
+            num_connection_handlers: int = 1, update_period: int = 30, start=False, checkpoint_dir=None, **kwargs):
         super().__init__()
         self.dht, self.experts, self.update_period = dht, expert_backends, update_period
-        self.addr, self.port = addr, port
-        self.conn_handlers = [ConnectionHandler(f"{self.addr}:{port}", self.experts)
-                              for _ in range(conn_handler_processes)]
+        if get_port(listen_on) is None:
+            self.listen_on = listen_on = replace_port(listen_on, new_port=find_open_port())
+        self.port = get_port(listen_on)
+
+        self.conn_handlers = [ConnectionHandler(listen_on, self.experts) for _ in range(num_connection_handlers)]
         if checkpoint_dir is not None:
             self.checkpoint_saver = CheckpointSaver(expert_backends, checkpoint_dir, update_period)
         else:
@@ -57,8 +63,8 @@ class Server(threading.Thread):
             if not self.dht.is_alive():
                 self.dht.run_in_background(await_ready=True)
 
-            dht_handler_thread = DHTHandlerThread(experts=self.experts, dht=self.dht,
-                                                  addr=self.addr, port=self.port, update_period=self.update_period)
+            dht_handler_thread = DHTHandlerThread(
+                experts=self.experts, dht=self.dht, endpoint=self.listen_on, update_period=self.update_period)
             dht_handler_thread.start()
         if self.checkpoint_saver is not None:
             self.checkpoint_saver.start()
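
A sketch of constructing a server with the renamed arguments (expert_backends is assumed to be a prebuilt Dict[str, ExpertBackend] and dht an optional hivemind.DHT; 4 handlers follow the docstring recommendation):

    server = hivemind.Server(dht, expert_backends, listen_on='0.0.0.0:*',
                             num_connection_handlers=4, update_period=30)
    server.start()
    server.ready.wait()  # a free port is substituted for '*' via find_open_port()
    print(f"Serving experts at {server.listen_on}")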

+ 4 - 4
hivemind/server/checkpoint_saver.py

@@ -8,20 +8,20 @@ from typing import Dict
 
 import torch
 
-from hivemind.runtime import ExpertBackend
+from hivemind.server.expert_backend import ExpertBackend
 
 
 class CheckpointSaver(threading.Thread):
-    def __init__(self, expert_backends: Dict[str, ExpertBackend], dir: Path, update_period: int):
+    def __init__(self, expert_backends: Dict[str, ExpertBackend], checkpoint_dir: Path, update_period: int):
         super().__init__()
         self.expert_backends = expert_backends
         self.update_period = update_period
-        self.dir = dir
+        self.checkpoint_dir = checkpoint_dir
         self.stop = False
 
     def run(self) -> None:
         while not self.stop:
-            store_experts(self.expert_backends, self.dir)
+            store_experts(self.expert_backends, self.checkpoint_dir)
             time.sleep(self.update_period)
 
 

+ 1 - 1
hivemind/server/connection_handler.py

@@ -8,7 +8,7 @@ import grpc.experimental.aio
 import torch
 import uvloop
 
-from hivemind.runtime.expert_backend import ExpertBackend
+from hivemind.server.expert_backend import ExpertBackend
 from hivemind.utils import get_logger, serialize_torch_tensor, deserialize_torch_tensor, Endpoint, runtime_pb2, runtime_grpc
 
 logger = get_logger(__name__)

+ 5 - 4
hivemind/server/dht_handler.py

@@ -2,13 +2,14 @@ import threading
 import time
 
 from hivemind.dht import DHT
+from hivemind.utils import Endpoint, get_port
 
 
 class DHTHandlerThread(threading.Thread):
-    def __init__(self, experts, dht: DHT, update_period: int = 5, addr: str = '127.0.0.1', port: int = 8080):
+    def __init__(self, experts, dht: DHT, endpoint: Endpoint, update_period: int = 5):
         super(DHTHandlerThread, self).__init__()
-        self.port = port
-        self.addr = addr
+        assert get_port(endpoint) is not None
+        self.endpoint = endpoint
         self.experts = experts
         self.dht = dht
         self.update_period = update_period
@@ -16,5 +17,5 @@ class DHTHandlerThread(threading.Thread):
 
     def run(self) -> None:
         while not self.stop:
-            self.dht.declare_experts(self.experts.keys(), self.addr, self.port)
+            self.dht.declare_experts(self.experts.keys(), self.endpoint)
             time.sleep(self.update_period)

+ 10 - 7
hivemind/runtime/expert_backend.py → hivemind/server/expert_backend.py

@@ -3,8 +3,9 @@ from typing import Dict, Sequence, Any, Tuple, Union
 import torch
 from torch import nn
 
-from hivemind.runtime.task_pool import TaskPool
-from hivemind.utils import nested_flatten, nested_pack, nested_compare, BatchTensorDescriptor, DUMMY_BATCH_SIZE, nested_map
+from hivemind.server.task_pool import TaskPool
+from hivemind.utils import nested_flatten, nested_pack, nested_compare, nested_map,\
+    BatchTensorDescriptor, DUMMY_BATCH_SIZE
 
 
 class ExpertBackend(nn.Module):
@@ -18,7 +19,7 @@ class ExpertBackend(nn.Module):
 
     :param expert: nn.Module to be wrapped into a backend. Arbitrary pytorch module with a few limitations:
 
-     - Experts must always receive the same set of \*args and \*\*kwargs and produce output tensors of same type
+     - Experts must always receive the same set of args and kwargs and produce output tensors of same type
      - All args, kwargs and outputs must be **tensors** where 0-th dimension represents to batch size
      - We recommend using experts that are ~invariant to the order in which they process batches
      - Using randomness (e.g. Dropout) leads to different samples at forward and backward. If you want consistency,
@@ -95,15 +96,17 @@ class ExpertBackend(nn.Module):
 
            .. todo correct state handling (see forward)
 
-           Please make sure to call ``ExpertBackend.apply_gradients`` **within** this method, otherwise the expert will not train
+           Please make sure to call ``ExpertBackend.apply_gradients`` here, otherwise the expert will not train
         """
         (args, kwargs), grad_outputs = nested_pack(inputs, structure=self.backward_schema)
 
         with torch.enable_grad():
             args = [tensor.detach().requires_grad_(True) if tensor.dtype in (torch.half, torch.float, torch.double)
                     else tensor.detach() for tensor in args]
-            kwargs = {input_key: (tensor.detach().requires_grad_(True) if tensor.dtype in (torch.half, torch.float, torch.double)
-                                  else tensor.detach()) for input_key, tensor in kwargs.items()}
+            kwargs = {input_key: (tensor.detach().requires_grad_(True)
+                                  if tensor.dtype in (torch.half, torch.float, torch.double)
+                                  else tensor.detach())
+                      for input_key, tensor in kwargs.items()}
 
             outputs = self.expert(*args, **kwargs)
             assert nested_compare(outputs, grad_outputs), "outputs and grad_outputs must have the same structure"
@@ -122,7 +125,7 @@ class ExpertBackend(nn.Module):
 
     def apply_gradients(self) -> None:
         """
-        Train the expert for a single step. This method is called by ``ExpertBackend.backward`` after computing gradients.
+        Train the expert for one step. This method is called by ``ExpertBackend.backward`` after computing gradients.
         """
         self.opt.step()
         self.opt.zero_grad()

+ 2 - 2
hivemind/runtime/__init__.py → hivemind/server/runtime.py

@@ -1,4 +1,5 @@
 import multiprocessing as mp
+import multiprocessing.pool
 import threading
 from itertools import chain
 from selectors import DefaultSelector, EVENT_READ
@@ -7,8 +8,7 @@ from typing import Dict
 import torch
 from prefetch_generator import BackgroundGenerator
 
-from hivemind.runtime.expert_backend import ExpertBackend
-from hivemind.runtime.task_pool import TaskPool, TaskPoolBase
+from hivemind.server.expert_backend import ExpertBackend
 from hivemind.utils import get_logger
 
 logger = get_logger(__name__)

+ 5 - 4
hivemind/runtime/task_pool.py → hivemind/server/task_pool.py

@@ -3,6 +3,7 @@ Task pool is responsible for receiving tasks and grouping them together for proc
 """
 import ctypes
 import multiprocessing as mp
+import multiprocessing.context
 import os
 import threading
 import time
@@ -14,7 +15,7 @@ from typing import List, Tuple, Dict, Any, Generator
 
 import torch
 
-from hivemind.utils import SharedFuture, get_logger
+from hivemind.utils import MPFuture, get_logger
 
 logger = get_logger(__name__)
 Task = namedtuple("Task", ("future", "args"))
@@ -86,10 +87,10 @@ class TaskPool(TaskPoolBase):
 
     def submit_task(self, *args: torch.Tensor) -> Future:
         """ Add task to this pool's queue, return Future for its output """
-        future1, future2 = SharedFuture.make_pair()
+        future1, future2 = MPFuture.make_pair()
         task = Task(future1, args)
         if self.get_task_size(task) > self.max_batch_size:
-            exc = ValueError(f"Task size greater than max_batch_size ({self.max_batch_size}), it will never be finished")
+            exc = ValueError(f"Task size greater than max_batch_size ({self.max_batch_size}), it can't be processed")
             future2.set_exception(exc)
         else:
             self.tasks.put(task)
@@ -127,7 +128,7 @@ class TaskPool(TaskPoolBase):
     def run(self, *args, **kwargs):
         torch.set_num_threads(1)
         logger.info(f'{self.uid} starting, pid={os.getpid()}')
-        pending_batches = {}  # Dict[batch uuid, List[SharedFuture]] for each batch currently in runtime
+        pending_batches = {}  # Dict[batch uuid, List[MPFuture]] for each batch currently in runtime
         output_thread = threading.Thread(target=self._pool_output_loop, args=[pending_batches],
                                          name=f'{self.uid}_output')
         try:
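
Given an already-running TaskPool instance (its construction is not shown in this hunk; the tensor shape is illustrative), submit_task returns an MPFuture-backed future for the batch output:

    batch_future = pool.submit_task(torch.randn(4, 1024))  # rejected with ValueError if larger than max_batch_size
    outputs = batch_future.result()  # resolves once the runtime has processed the batch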

+ 2 - 3
hivemind/utils/__init__.py

@@ -1,9 +1,8 @@
-from hivemind.utils.connection import *
-from hivemind.utils.data import *
+from hivemind.utils.networking import *
 from hivemind.utils.nested import *
 from hivemind.utils.tensor_descr import *
 from hivemind.utils.serializer import *
-from hivemind.utils.shared_future import *
+from hivemind.utils.mpfuture import *
 from hivemind.utils.threading import *
 from hivemind.utils.autograd import *
 from hivemind.utils.grpc import *

+ 2 - 1
hivemind/utils/autograd.py

@@ -92,7 +92,8 @@ class _ParallelApplyFunction(torch.autograd.Function):
     @staticmethod
     def backward(ctx, *grad_outputs_flat: torch.Tensor):
         func, contexts, output_strides = ctx._inner_func, ctx._call_contexts, ctx._output_strides
-        grad_outputs_per_call = [grad_outputs_flat[output_strides[i]: output_strides[i + 1]] for i in range(len(contexts))]
+        grad_outputs_per_call = [grad_outputs_flat[output_strides[i]: output_strides[i + 1]]
+                                 for i in range(len(contexts))]
         futures = [run_in_background(run_isolated_backward, func, context, *grads)
                    for context, grads in zip(contexts, grad_outputs_per_call)]
         flat_grads_wrt_input = tuple(grad for future in futures for grad in future.result())

+ 0 - 3
hivemind/utils/data.py

@@ -1,3 +0,0 @@
-import torch
-
-DUMMY = torch.empty(0, requires_grad=True)

+ 2 - 1
hivemind/utils/grpc.py

@@ -47,7 +47,8 @@ def compile_grpc(proto: str, *args: str) -> Tuple[Namespace, Namespace]:
                 raise ImportError("Something changed sys.path while compile_grpc was in progress.")
 
 
-with open(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'server', 'connection_handler.proto')) as f_proto:
+with open(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+                       'server', 'connection_handler.proto')) as f_proto:
     runtime_pb2, runtime_grpc = compile_grpc(f_proto.read())
 
 

+ 2 - 2
hivemind/utils/logging.py

@@ -8,8 +8,8 @@ def get_logger(module_name: str) -> logging.Logger:
     loglevel = os.getenv('LOGLEVEL', 'INFO')
 
     logging.addLevelName(logging.WARNING, 'WARN')
-    formatter = logging.Formatter(fmt='[{asctime}.{msecs:03.0f}][{levelname}][{name}.{funcName}:{lineno}] {message}', style='{',
-                                  datefmt='%Y/%m/%d %H:%M:%S')
+    formatter = logging.Formatter(fmt='[{asctime}.{msecs:03.0f}][{levelname}][{name}.{funcName}:{lineno}] {message}',
+                                  style='{', datefmt='%Y/%m/%d %H:%M:%S')
     handler = logging.StreamHandler()
     handler.setFormatter(formatter)
     logger = logging.getLogger(name_without_prefix)

+ 1 - 1
hivemind/utils/shared_future.py → hivemind/utils/mpfuture.py

@@ -5,7 +5,7 @@ from warnings import warn
 import asyncio
 
 
-class SharedFuture(Future):
+class MPFuture(Future):
     """ Multiprocessing version of concurrent.futures.Future, interacts between two processes via Pipe """
     STATES = 'pending', 'running', 'cancelled', 'finished', 'exception'
     STATE_PENDING, STATE_RUNNING, STATE_CANCELLED, STATE_FINISHED, STATE_EXCEPTION = STATES
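
A sketch of the renamed class in use (the worker function and value are illustrative): MPFuture.make_pair() creates two linked futures, and resolving one side from another process resolves the other through the underlying Pipe, mirroring how DHT hands the second future to its background process:

    import multiprocessing as mp
    from hivemind.utils import MPFuture

    def _worker(side_b: MPFuture):
        side_b.set_result(42)  # propagates to side_a in the parent process

    side_a, side_b = MPFuture.make_pair()
    mp.Process(target=_worker, args=(side_b,), daemon=True).start()
    assert side_a.result() == 42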

+ 16 - 0
hivemind/utils/connection.py → hivemind/utils/networking.py

@@ -1,11 +1,27 @@
 import socket
+import urllib.parse
 from contextlib import closing
+from typing import Optional
 
 Hostname, Port = str, int  # flavour types
 Endpoint = str  # e.g. 1.2.3.4:1337 or [2a21:6с8:b192:2105]:8888, https://networkengineering.stackexchange.com/a/9435
 LOCALHOST = '127.0.0.1'
 
 
+def get_port(endpoint: Endpoint) -> Optional[Port]:
+    """ get port or None if port is undefined """
+    # TODO: find a standard way to get port, make sure it works in malformed ports
+    try:
+        return int(endpoint[endpoint.rindex(':') + 1:], base=10)
+    except ValueError:  # :* or not specified
+        return None
+
+
+def replace_port(endpoint: Endpoint, new_port: Port) -> Endpoint:
+    assert endpoint.endswith(':*') or get_port(endpoint) is not None, endpoint
+    return f"{endpoint[:endpoint.rindex(':')]}:{new_port}"
+
+
 def find_open_port(params=(socket.AF_INET, socket.SOCK_STREAM), opt=(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)):
     """ Finds a tcp port that can be occupied with a socket with *params and use *opt options """
     try:
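
The new endpoint helpers can be exercised directly; a small illustration (values chosen arbitrarily):

    from hivemind.utils import get_port, replace_port, find_open_port

    assert get_port('127.0.0.1:8080') == 8080
    assert get_port('0.0.0.0:*') is None                    # wildcard / unspecified port
    assert replace_port('0.0.0.0:*', 1337) == '0.0.0.0:1337'
    endpoint = replace_port('0.0.0.0:*', find_open_port())  # bind-ready endpoint with a free TCP port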

+ 0 - 17
hivemind/utils/serializer.py

@@ -2,11 +2,9 @@
 import pickle
 from io import BytesIO
 
-import joblib
 import torch
 import umsgpack
 
-
 class SerializerBase:
     @staticmethod
     def dumps(obj: object) -> bytes:
@@ -17,19 +15,6 @@ class SerializerBase:
         raise NotImplementedError()
 
 
-class JoblibSerializer(SerializerBase):
-
-    @staticmethod
-    def dumps(obj: object) -> bytes:
-        s = BytesIO()
-        joblib.dump(obj, s)
-        return s.getvalue()
-
-    @staticmethod
-    def loads(buf: bytes) -> object:
-        return joblib.load(BytesIO(buf))
-
-
 class PickleSerializer(SerializerBase):
     @staticmethod
     def dumps(obj: object) -> bytes:
@@ -41,7 +26,6 @@ class PickleSerializer(SerializerBase):
 
 
 class PytorchSerializer(SerializerBase):
-
     @staticmethod
     def dumps(obj: object) -> bytes:
         s = BytesIO()
@@ -54,7 +38,6 @@ class PytorchSerializer(SerializerBase):
 
 
 class MSGPackSerializer(SerializerBase):
-
     @staticmethod
     def dumps(obj: object) -> bytes:
         return umsgpack.dumps(obj, use_bin_type=False)  # TODO strict https://github.com/msgpack/msgpack-python/pull/158

+ 2 - 1
hivemind/utils/tensor_descr.py

@@ -45,7 +45,8 @@ class BatchTensorDescriptor(TensorDescriptor):
     @classmethod
     def from_tensor(cls, tensor: torch.Tensor):
         return cls(*tensor.shape[1:], dtype=tensor.dtype, layout=tensor.layout,
-                   device=tensor.device, requires_grad=tensor.requires_grad, pin_memory=torch.cuda.is_available() and tensor.is_pinned())
+                   device=tensor.device, requires_grad=tensor.requires_grad,
+                   pin_memory=torch.cuda.is_available() and tensor.is_pinned())
 
     def make_empty(self, batch_size, **kwargs):
         assert self.shape[0] is None, "Make sure 0-th dimension is not specified (set to None)"

+ 0 - 1
requirements.txt

@@ -1,5 +1,4 @@
 torch>=1.3.0
-joblib>=0.13
 numpy>=1.17
 prefetch_generator>=1.0.1
 umsgpack

+ 6 - 6
tests/benchmark_dht.py

@@ -9,9 +9,9 @@ from tqdm import trange
 from test_utils import increase_file_limit
 
 
-def random_endpoint() -> Tuple[str, int]:
-    return (f"{random.randint(0, 256)}.{random.randint(0, 256)}."
-            f"{random.randint(0, 256)}.{random.randint(0, 256)}", random.randint(0, 65535))
+def random_endpoint() -> hivemind.Endpoint:
+    return f"{random.randint(0, 256)}.{random.randint(0, 256)}.{random.randint(0, 256)}." \
+           f"{random.randint(0, 256)}:{random.randint(0, 65535)}"
 
 
 def benchmark_dht(num_peers: int, initial_peers: int, num_experts: int, expert_batch_size: int, random_seed: int,
@@ -23,7 +23,7 @@ def benchmark_dht(num_peers: int, initial_peers: int, num_experts: int, expert_b
     peers = []
     for _ in trange(num_peers):
         neighbors = [f'0.0.0.0:{node.port}' for node in random.sample(peers, min(initial_peers, len(peers)))]
-        peer = hivemind.DHT(*neighbors, start=True, wait_timeout=wait_timeout, listen_on=f'0.0.0.0:*')
+        peer = hivemind.DHT(initial_peers=neighbors, start=True, wait_timeout=wait_timeout, listen_on=f'0.0.0.0:*')
         peers.append(peer)
 
     store_peer, get_peer = peers[-2:]
@@ -41,7 +41,7 @@ def benchmark_dht(num_peers: int, initial_peers: int, num_experts: int, expert_b
     for start in trange(0, num_experts, expert_batch_size):
         store_start = time.perf_counter()
         endpoints.append(random_endpoint())
-        success_list = store_peer.declare_experts(expert_uids[start: start + expert_batch_size], *endpoints[-1])
+        success_list = store_peer.declare_experts(expert_uids[start: start + expert_batch_size], endpoints[-1])
         total_store_time += time.perf_counter() - store_start
 
         total_stores += len(success_list)
@@ -64,7 +64,7 @@ def benchmark_dht(num_peers: int, initial_peers: int, num_experts: int, expert_b
 
         for i, expert in enumerate(get_result):
             if expert is not None and expert.uid == expert_uids[start + i] \
-                    and (expert.host, expert.port) == endpoints[start // expert_batch_size]:
+                    and expert.endpoint == endpoints[start // expert_batch_size]:
                 successful_gets += 1
 
     if time.perf_counter() - benchmark_started > expiration_time:

+ 3 - 2
tests/benchmark_throughput.py

@@ -14,7 +14,7 @@ from hivemind import find_open_port
 def client_process(can_start, benchmarking_failed, port, num_experts, batch_size, hid_dim, num_batches, backprop=True):
     torch.set_num_threads(1)
     can_start.wait()
-    experts = [hivemind.RemoteExpert(f"expert{i}", port=port) for i in range(num_experts)]
+    experts = [hivemind.RemoteExpert(f"expert{i}", endpoint=f"{hivemind.LOCALHOST}:{port}") for i in range(num_experts)]
 
     try:
         dummy_batch = torch.randn(batch_size, hid_dim)
@@ -69,7 +69,8 @@ def benchmark_throughput(num_experts=16, num_handlers=None, num_clients=128, num
                                                            max_batch_size=max_batch_size,
                                                            )
         timestamps['created_experts'] = time.perf_counter()
-        server = hivemind.Server(None, experts, port=port, conn_handler_processes=num_handlers, device=device)
+        server = hivemind.Server(None, experts, listen_on=f"{hivemind.LOCALHOST}:{port}",
+                                 num_connection_handlers=num_handlers, device=device)
         server.start()
         server.ready.wait()
         timestamps['server_ready'] = time.perf_counter()

+ 15 - 12
tests/test_dht.py

@@ -5,7 +5,7 @@ import random
 import heapq
 import uuid
 from itertools import chain
-from typing import Optional
+from typing import Optional, Tuple
 import numpy as np
 
 import hivemind
@@ -68,11 +68,12 @@ def test_dht_protocol():
                 protocol.call_find(f'{LOCALHOST}:{peer1_port}', [key]))[key]
             recv_value = hivemind.MSGPackSerializer.loads(recv_value_bytes)
             (recv_id, recv_endpoint) = next(iter(nodes_found.items()))
-            assert recv_id == peer2_id and recv_endpoint == f"{LOCALHOST}:{peer2_port}", \
+            assert recv_id == peer2_id and ':'.join(recv_endpoint.split(':')[-2:]) == f"{LOCALHOST}:{peer2_port}", \
                 f"expected id={peer2_id}, peer={LOCALHOST}:{peer2_port} but got {recv_id}, {recv_endpoint}"
 
-            assert recv_value == value and recv_expiration == expiration, "call_find_value expected " \
-                                                                          f"{value} (expires by {expiration}) but got {recv_value} (expires by {recv_expiration})"
+            assert recv_value == value and recv_expiration == expiration, \
+                f"call_find_value expected {value} (expires by {expiration}) " \
+                f"but got {recv_value} (expires by {recv_expiration})"
 
             # peer 2 must know about peer 1, but not have a *random* nonexistent value
             dummy_key = DHTID.generate()
@@ -89,7 +90,7 @@ def test_dht_protocol():
 
             if listen:
                 loop.run_until_complete(protocol.shutdown())
-            print("DHTProtocol test finished sucessfully!")
+            print("DHTProtocol test finished successfully!")
             test_success.set()
 
     tester = mp.Process(target=_tester, daemon=True)
@@ -178,13 +179,15 @@ def test_dht_node():
 
         # test 1: find self
         nearest = loop.run_until_complete(me.find_nearest_nodes([me.node_id], k_nearest=1))[me.node_id]
-        assert len(nearest) == 1 and nearest[me.node_id] == f"{LOCALHOST}:{me.port}"
+        assert len(nearest) == 1 and ':'.join(nearest[me.node_id].split(':')[-2:]) == f"{LOCALHOST}:{me.port}"
 
         # test 2: find others
         for i in range(10):
             ref_endpoint, query_id = random.choice(list(dht.items()))
             nearest = loop.run_until_complete(me.find_nearest_nodes([query_id], k_nearest=1))[query_id]
-            assert len(nearest) == 1 and next(iter(nearest.items())) == (query_id, ref_endpoint)
+            assert len(nearest) == 1
+            found_node_id, found_endpoint = next(iter(nearest.items()))
+            assert found_node_id == query_id and ':'.join(found_endpoint.split(':')[-2:]) == ref_endpoint
 
         # test 3: find neighbors to random nodes
         accuracy_numerator = accuracy_denominator = 0  # top-1 nearest neighbor accuracy
@@ -266,7 +269,7 @@ def test_hivemind_dht():
     peers = [hivemind.DHT(start=True)]
     for i in range(10):
         neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
-        peers.append(hivemind.DHT(*neighbors_i, start=True))
+        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))
 
     you: hivemind.dht.DHT = random.choice(peers)
     theguyshetoldyounottoworryabout: hivemind.dht.DHT = random.choice(peers)
@@ -281,10 +284,10 @@ def test_hivemind_dht():
     assert all(res is None for res in found[-2:]), "Found non-existing experts"
 
     that_guys_expert, that_guys_port = str(uuid.uuid4()), random.randint(1000, 9999)
-    theguyshetoldyounottoworryabout.declare_experts([that_guys_expert], 'that_host', that_guys_port)
+    theguyshetoldyounottoworryabout.declare_experts([that_guys_expert], f'that_host:{that_guys_port}')
     you_notfound, you_found = you.get_experts(['foobar', that_guys_expert])
     assert isinstance(you_found, hivemind.RemoteExpert)
-    assert you_found.host == 'that_host', you_found.port == that_guys_port
+    assert you_found.endpoint == f'that_host:{that_guys_port}'
 
     # test first_k_active
     assert theguyshetoldyounottoworryabout.first_k_active(expert_uids, k=10) == expert_uids[:10]
@@ -302,9 +305,9 @@ def test_hivemind_dht():
 
 def test_dht_single_node():
     node = hivemind.DHT(start=True)
-    assert all(node.declare_experts(['e1', 'e2', 'e3'], hivemind.LOCALHOST, 1337))
+    assert all(node.declare_experts(['e1', 'e2', 'e3'], f"{hivemind.LOCALHOST}:{1337}"))
     for expert in node.get_experts(['e3', 'e2']):
-        assert expert.host == hivemind.LOCALHOST and expert.port == 1337
+        assert expert.endpoint == f"{hivemind.LOCALHOST}:{1337}"
     assert node.first_k_active(['e0', 'e1', 'e3', 'e5', 'e2'], k=2) == ['e1', 'e3']
 
 

+ 6 - 6
tests/test_moe.py

@@ -19,8 +19,8 @@ def test_remote_module_call():
     random_proj = torch.randn_like(xx)
 
     with background_server(num_experts=num_experts, device='cpu', num_handlers=1,
-                           no_optimizer=True, no_dht=True) as (localhost, server_port, dht_port):
-        experts = [hivemind.RemoteExpert(uid=f'expert.{i}', port=server_port) for i in range(num_experts)]
+                           no_optimizer=True, no_dht=True) as (server_endpoint, dht_endpoint):
+        experts = [hivemind.RemoteExpert(uid=f'expert.{i}', endpoint=server_endpoint) for i in range(num_experts)]
         moe_output, = hivemind.client.moe._RemoteMoECall.apply(
             logits, experts[:len(logits)], k_min, timeout_after_k_min, backward_k_min, timeout_total, backward_timeout,
             [(None,), {}], xx)
@@ -51,8 +51,8 @@ def test_determinism():
     mask = torch.randint(0, 1, (32, 1024))
 
     with background_server(num_experts=1, device='cpu', expert_cls='det_dropout', num_handlers=1,
-                           no_optimizer=True, no_dht=True) as (interface, server_port, dht_port):
-        expert = hivemind.RemoteExpert(uid=f'expert.0', port=server_port)
+                           no_optimizer=True, no_dht=True) as (server_endpoint, dht_endpoint):
+        expert = hivemind.RemoteExpert(uid=f'expert.0', endpoint=server_endpoint)
 
         out = expert(xx, mask)
         out_rerun = expert(xx, mask)
@@ -70,11 +70,11 @@ def test_compute_expert_scores():
         moe = hivemind.client.moe.RemoteMixtureOfExperts(
             dht=dht, in_features=1024, grid_size=(40,), k_best=4, k_min=1, timeout_after_k_min=1,
             uid_prefix='expert')
-        gx, gy = torch.randn(4, 5, requires_grad=True), torch.torch.randn(4, 3, requires_grad=True)
+        gx, gy = torch.randn(4, 5, requires_grad=True), torch.randn(4, 3, requires_grad=True)
         ii = [[4, 0, 2], [3, 1, 1, 1, 3], [0], [3, 2]]
         jj = [[2, 2, 1], [0, 1, 2, 0, 1], [0], [1, 2]]
         batch_experts = [
-            [hivemind.RemoteExpert(uid=f'expert.{ii[batch_i][expert_i]}.{jj[batch_i][expert_i]}')
+            [hivemind.RemoteExpert(uid=f'expert.{ii[batch_i][expert_i]}.{jj[batch_i][expert_i]}', endpoint="[::]:1337")
              for expert_i in range(len(ii[batch_i]))]
             for batch_i in range(len(ii))
         ]  # note: these experts do not exists on server, we use them only to test moe compute_expert_scores

+ 4 - 3
tests/test_routing.py

@@ -115,16 +115,17 @@ def test_routing_table_search():
             k = random.randint(1, 100)
             query_id = DHTID.generate()
             exclude = query_id if random.random() < 0.5 else None
-            our_knn, our_addrs = zip(*routing_table.get_nearest_neighbors(query_id, k=k, exclude=exclude))
+            our_knn, our_endpoints = zip(*routing_table.get_nearest_neighbors(query_id, k=k, exclude=exclude))
             reference_knn = heapq.nsmallest(k, all_active_neighbors, key=query_id.xor_distance)
             assert all(our == ref for our, ref in zip_longest(our_knn, reference_knn))
-            assert all(our_addr == routing_table[our_node] for our_node, our_addr in zip(our_knn, our_addrs))
+            assert all(our_endpoint == routing_table[our_node]
+                       for our_node, our_endpoint in zip(our_knn, our_endpoints))
 
         # queries from table
         for i in range(1000):
             k = random.randint(1, 100)
             query_id = random.choice(all_active_neighbors)
-            our_knn, our_addrs = zip(*routing_table.get_nearest_neighbors(query_id, k=k, exclude=query_id))
+            our_knn, our_endpoints = zip(*routing_table.get_nearest_neighbors(query_id, k=k, exclude=query_id))
 
             reference_knn = heapq.nsmallest(k + 1, all_active_neighbors, key=query_id.xor_distance)
             if query_id in reference_knn:

+ 4 - 7
tests/test_training.py

@@ -1,4 +1,3 @@
-#%env CUDA_VISIBLE_DEVICES=
 import argparse
 from typing import Optional
 
@@ -6,21 +5,19 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from hivemind import RemoteExpert, find_open_port
+from hivemind import RemoteExpert, find_open_port, LOCALHOST
 from test_utils.run_server import background_server
 
 from sklearn.datasets import load_digits
 
 
 def test_training(port: Optional[int] = None, max_steps: int = 100, threshold: float = 0.9):
-    if port is None:
-        port = find_open_port()
     dataset = load_digits()
     X_train, y_train = torch.tensor(dataset['data'], dtype=torch.float), torch.tensor(dataset['target'])
 
-    with background_server(num_experts=2, device='cpu', port=port, hidden_dim=64):
-        expert1 = RemoteExpert('expert.0', host='127.0.0.1', port=port)
-        expert2 = RemoteExpert('expert.1', host='127.0.0.1', port=port)
+    with background_server(num_experts=2, device='cpu', hidden_dim=64) as (server_endpoint, _):
+        expert1 = RemoteExpert('expert.0', server_endpoint)
+        expert2 = RemoteExpert('expert.1', server_endpoint)
         model = nn.Sequential(expert2, nn.Tanh(), expert1, nn.Linear(64, 10))
 
         opt = torch.optim.SGD(model.parameters(), lr=0.05)

+ 19 - 13
tests/test_utils/run_server.py

@@ -3,20 +3,21 @@ import multiprocessing as mp
 from contextlib import contextmanager
 
 import resource
+from typing import Tuple
+
 import torch
 
 import hivemind
 from test_utils.layers import name_to_block, name_to_input
 
 
-def make_dummy_server(interface='0.0.0.0', port=None, num_experts=1, expert_cls='ffn', hidden_dim=1024,
+def make_dummy_server(listen_on='0.0.0.0:*', num_experts=1, expert_cls='ffn', hidden_dim=1024,
                       num_handlers=None, expert_prefix='expert', expert_offset=0, max_batch_size=16384, device=None,
                       no_optimizer=False, no_dht=False, initial_peers=(), dht_port=None, root_port=None, verbose=True,
                       UID_DELIMETER=hivemind.DHT.UID_DELIMETER, start=False, **kwargs) -> hivemind.Server:
     """
     Instantiate a server with several identical experts. See argparse comments below for details
-    :param interface: 'localhost' for local connections only, '0.0.0.0' for ipv4 '::' for ipv6
-    :param port: main server will listen to this port, default = find open port
+    :param listen_on: network interface with address and (optional) port, e.g. "127.0.0.1:1337" or "[::]:80"
     :param num_experts: run this many identical experts
     :param expert_cls: expert type from test_utils.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop';
     :param hidden_dim: main dimension for expert_cls
@@ -46,8 +47,9 @@ def make_dummy_server(interface='0.0.0.0', port=None, num_experts=1, expert_cls=
     if not no_dht:
         if not len(initial_peers):
             print("No initial peers provided. Starting additional dht as an initial peer.")
-            dht_root = hivemind.DHT(
-                *initial_peers, listen_on=f"{hivemind.LOCALHOST}:{root_port or hivemind.find_open_port()}", start=True)
+            dht_root = hivemind.DHT(initial_peers=initial_peers,
+                                    listen_on=f"{hivemind.LOCALHOST}:{root_port or hivemind.find_open_port()}",
+                                    start=True)
             print(f"Initializing DHT with port {dht_root.port}")
             initial_peers = [f"{hivemind.LOCALHOST}:{dht_root.port}"]
         else:
@@ -55,8 +57,9 @@ def make_dummy_server(interface='0.0.0.0', port=None, num_experts=1, expert_cls=
             if root_port is not None:
                 print(f"Warning: root_port={root_port} will not be used since we already have peers.")
 
-        dht = hivemind.DHT(
-            *initial_peers, listen_on=f"{hivemind.LOCALHOST}:{dht_port or hivemind.find_open_port()}", start=True)
+        dht = hivemind.DHT(initial_peers=initial_peers,
+                           listen_on=f"{hivemind.LOCALHOST}:{dht_port or hivemind.find_open_port()}",
+                           start=True)
         if verbose:
             print(f"Running dht node on port {dht.port}")
 
@@ -79,19 +82,19 @@ def make_dummy_server(interface='0.0.0.0', port=None, num_experts=1, expert_cls=
                                                      )
     # actually start server
     server = hivemind.Server(
-        dht, experts, addr=interface, port=port or hivemind.find_open_port(),
-        conn_handler_processes=num_handlers, device=device)
+        dht, experts, listen_on=listen_on,
+        num_connection_handlers=num_handlers, device=device)
 
     if start:
         server.run_in_background(await_ready=True)
         if verbose:
-            print(f"Server started at {server.addr}:{server.port}")
+            print(f"Server started at {server.listen_on}")
             print(f"Got {num_experts} active experts of type {expert_cls}: {list(experts.keys())}")
     return server
 
 
 @contextmanager
-def background_server(*args, shutdown_timeout=5, verbose=True, **kwargs):
+def background_server(*args, shutdown_timeout=5, verbose=True, **kwargs) -> Tuple[hivemind.Endpoint, hivemind.Endpoint]:
     """ A context manager that creates server in a background thread, awaits .ready on entry and shutdowns on exit """
     pipe, runners_pipe = mp.Pipe(duplex=True)
     runner = mp.get_context("spawn").Process(
@@ -115,8 +118,11 @@ def background_server(*args, shutdown_timeout=5, verbose=True, **kwargs):
 def _server_runner(pipe, *args, verbose, **kwargs):
     server = make_dummy_server(*args, verbose=verbose, start=True, **kwargs)
     try:
-        dht_port = server.dht.port if server.dht is not None else None
-        pipe.send((server.addr, server.port, dht_port))
+        if server.dht is not None:
+            dht_listen_on = hivemind.replace_port(server.dht.listen_on, server.dht.port)
+        else:
+            dht_listen_on = None
+        pipe.send((server.listen_on, dht_listen_on))
         pipe.recv()  # wait for shutdown signal
     finally:
         if verbose: