
Re-write DHT (#37)

* documentation

* DHTNodeID and tests

* wip: dht

* define depth according to 4.2; split routing tests

* better docs on depth modulo

* dht notuml

* move TODOs to PR

* allow excluding any id, add kademlia rpc, add expiration time

* fluff

* kademliaprotocol no longer uses DHTNode

* kademliaprotocol no longer uses DHTNode

* do not override default to_bytes

* call parent init

* disable notimplementederror for test purposes

* add tests for dhtid <=> bytes conversion

* send binary dhtid over rpc

* handle tuple to list for older msgpack

* add basic test for kademlia rpc

* convert key to bytes

* dhtnode bootstrap

* basic dht crawling based on beam search

* fix protocol tests

* minor rename

* * bucket staleness is now handled by DHTNode, not KademliaProtocol
* pinging nodes is now performed by protocol, not routing table
* protocol now updates routing table asynchronously (reduces rpc latency under load)
* routing table will now avoid pinging node if this node is already being ping-ed

* fallback to standard eviction policy

* * fix bug with nearest neighbors for incomplete buckets
* add test for nearest neighbors with incomplete buckets
* make DHTNode asynchronous, as expected

* temp fix circleci

* temp fix circleci

* Implemented True logic for LocalStorage

* Added tests for LocalStorage class

* finish merging

* remove debugprint

* add beam search tests

* test: check that beam search always finds top-1 node

* Change import order

* fixed None maxsize

* fixed expiration time

* removed useless test call

* moved localstorage tests to test_kademlia

* Local storage (#44)

* Implemented True logic for LocalStorage

* Added tests for LocalStorage class

* Change import order

* fixed None maxsize

* fixed expiration time

* removed useless test call

* moved localstorage tests to test_kademlia

* Implemented get and store node functions

* Fixed value order in node.get

* Added dht benchmark

* Fixed node store/get test

* close connection before returning

* a more definitive find_open_port

* fix test for neighbor lookups

* remove ipynb-specific asyncio calls

* move beam_search to a separate file

* use preferred way to create task

* fix warnings

* make DHTNode get/store work on arbitrary keys, not just DHTID

* Local storage with caching (#46)

* Implemented better removal of old  keys from LocalStorage

* merged specialize_dht and fixed tests

* Fixed unixtime

* Fixed indexError in LocalStorage remove_outdated

* Fixed indexError in LocalStorage remove_outdated

* Fixed error where changing the expiration time was not reflected in the table

* Fixed error with keeping expired value

* Fixed error with keeping expired value

* Added tests for LocalStorage

* Implemented cache in LocalStorage

* Added test for caching in LocalStorage

* Fixing cache cleaning for LocalStorage

* Renamed maxsize to cache size

* Fixed maxsize to cachesize

* Removed cache. Separate storage for cache

* Removed cache tests

* Implemented caching in KademliaProtocol

* simplify public DHT interface

* switch from time.monotonic to unixtime, allow changing it in a centralized function

* remove kademlia requirement, use rpcudp directly

* rename, revert to run_coroutine_threadsafe

* separate wait and timeout args

* add test for hivemind dht

* handle responses to cancelled tasks

* moved comments to issue

Co-authored-by: Vsevolod-pl <vsevolod-pl@yandex.ru>
Co-authored-by: Vsevolod-pl <Vsevolod-pl@users.noreply.github.com>
justheuristic 5 years ago
Parent
Commit
d961ceb6ae

BIN
docs/_static/dht.odp


BIN
docs/_static/dht.pdf


BIN
docs/_static/dht.png


+ 2 - 2
docs/user/quickstart.md

@@ -35,11 +35,11 @@ do something complex with it, please contact us by opening an issue (less prefer
 - **`Runtime`** (`hivemind/runtime/__init__.py`) aggregates batches
   and performs inference/training of experts according to their priority.
 - **`Server`** (`hivemind/server/__init__.py`) wraps runtime and
-  periodically uploads experts into `DHTNode`.
+  periodically uploads experts into `DHT`.
 
 **DHT:**
 
-- **`DHTNode`**(`hivemind/dht/__init__.py`) is a node of
+- **`DHT`**(`hivemind/dht/__init__.py`) is a node of
   Kademlia-based DHT that stores metadata used by trainer and runtime.
 
 ## Limitations

+ 1 - 1
hivemind/client/moe.py

@@ -26,7 +26,7 @@ class RemoteMixtureOfExperts(nn.Module):
     :param grid_size: hivemind dimensions that form expert uid (see below)
     :param uid_prefix: common prefix for all expert uids
      expert uid follows the pattern {uid_prefix}.{0...grid_size[0]}.{0...grid_size[1]}...{0...grid_size[-1]}
-    :param dht: DHTNode where the experts reside
+    :param dht: DHT where the experts reside
     :param num_workers: number of threads for parallel dht operation
     :param k_best: queries this many experts with highest scores
     :param k_min: makes sure at least this many experts returned output

+ 88 - 59
hivemind/dht/__init__.py

@@ -1,36 +1,57 @@
+"""
+This sub-module implements a node in a Kademlia-based DHT. The code is organized as follows:
+ * class DHT (below) - high-level class for model training. Runs DHTNode in a background process.
+ * class DHTNode (node.py) - an asyncio implementation of a dht server that stores AND gets keys.
+ * class KademliaProtocol (protocol.py) - an rpc protocol to request data from dht nodes. Asyncio-based.
+
+The code in this module is a modified version of https://github.com/bmuller/kademlia
+Brian, if you're reading this: THANK YOU! you're awesome :)
+"""
 import asyncio
-import datetime
 import multiprocessing as mp
 import warnings
 from typing import Tuple, List, Optional
 
-from kademlia.network import Server
+from .node import DHTNode, DHTID, DHTExpiration
+from .routing import get_dht_time
 
-from hivemind.client import RemoteExpert
-from hivemind.utils import run_forever, SharedFuture, PickleSerializer
+from ..client import RemoteExpert
+from ..utils import SharedFuture, find_open_port, Hostname, Port, run_in_background
 
 
-class DHTNode(mp.Process):
+class DHT(mp.Process):
+    """
+    A high-level interface to hivemind DHT. Runs a dht node in a background process.
+    :param initial_peers: one or multiple pairs of (host, port) pointing to active DHT peers. Default: no peers
+    :param port: a port where DHT will listen to incoming connections. Defaults to hivemind.utils.find_open_port
+    :param start: if True, automatically starts the background process on creation. Otherwise await manual start
+    :param daemon: if True, the background process is marked as daemon and terminated automatically after the main process exits
+    :param node_params: any other params will be forwarded to DHTNode upon creation
+    """
     UID_DELIMETER = '.'  # splits expert uids over this delimeter
-    HEARTBEAT_EXPIRATION = 120  # expert is inactive iff it fails to post timestamp for *this many seconds*
+    EXPIRATION = 120  # anything written to DHT is considered expired after this many seconds
     make_key = "{}::{}".format
 
-    def __init__(self, *initial_peers: Tuple[str, int], port=8081, start=False, daemon=True):
+    def __init__(self, *initial_peers: Tuple[Hostname, Port], port: Optional[Port] = None,
+                 start: bool, daemon: bool = True, **node_params):
         super().__init__()
-        self.port, self.initial_peers = port, initial_peers
+        port = find_open_port() if port is None else port
+        self.node: Optional[DHTNode] = None  # to be initialized in self.run
+        self.port, self.initial_peers, self.node_params = port, initial_peers, node_params
         self._pipe, self.pipe = mp.Pipe(duplex=False)
         self.ready = mp.Event()
-        self.server = Server()
         self.daemon = daemon
         if start:
             self.run_in_background(await_ready=True)
 
     def run(self) -> None:
+        if asyncio.get_event_loop().is_running():
+            asyncio.get_event_loop().stop()  # if we're in jupyter, get rid of its built-in event loop
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
-        loop.run_until_complete(self.server.listen(self.port))
-        loop.run_until_complete(self.server.bootstrap(self.initial_peers))
-        run_forever(loop.run_forever)
+
+        self.node = DHTNode(initial_peers=list(self.initial_peers), port=self.port, **self.node_params)
+        run_in_background(loop.run_forever)
         self.ready.set()
 
         while True:
@@ -39,7 +60,7 @@ class DHTNode(mp.Process):
 
     def run_in_background(self, await_ready=True, timeout=None):
         """
-        Starts DHTNode in a background process. if await_ready, this method will wait until background dht
+        Starts DHT in a background process. If await_ready, this method will wait until the background dht
         is ready to process incoming requests or for :timeout: seconds max.
         """
         self.start()
@@ -53,98 +74,106 @@ class DHTNode(mp.Process):
         else:
             warnings.warn("DHT shutdown has no effect: dht process is already not alive")
 
-    def get_experts(self, uids: List[str], heartbeat_expiration=HEARTBEAT_EXPIRATION) -> List[Optional[RemoteExpert]]:
-        """ Find experts across DHT using their ids; Return a list of [RemoteExpert if found else None]"""
+    def get_experts(self, uids: List[str], expiration=None) -> List[Optional[RemoteExpert]]:
+        """
+        :param uids: find experts with these ids from across the DHT
+        :param expiration: returns experts that expire no sooner than this (based on get_dht_time), default = now
+        :returns: a list of [RemoteExpert if found else None]
+        """
         future, _future = SharedFuture.make_pair()
-        self.pipe.send(('_get_experts', [], dict(uids=uids, heartbeat_expiration=heartbeat_expiration, future=_future)))
+        self.pipe.send(('_get_experts', [], dict(uids=uids, expiration=expiration, future=_future)))
         return future.result()
 
-    def _get_experts(self, uids: List[str], heartbeat_expiration: float, future: SharedFuture):
+    def _get_experts(self, uids: List[str], expiration: Optional[DHTExpiration], future: SharedFuture):
         loop = asyncio.get_event_loop()
+        expiration = expiration or get_dht_time()
+
         lookup_futures = [asyncio.run_coroutine_threadsafe(
-            self.server.get(self.make_key('expert', uid)), loop) for uid in uids]
-        current_time = datetime.datetime.now()
+            self.node.get(self.make_key('expert', uid), expiration), loop) for uid in uids]
 
-        experts = [None] * len(uids)
+        experts: List[Optional[RemoteExpert]] = [None] * len(uids)
         for i, (uid, lookup) in enumerate(zip(uids, lookup_futures)):
-            if lookup.result() is not None:
-                (host, port), timestamp = PickleSerializer.loads(lookup.result())
-                if (current_time - timestamp).total_seconds() <= heartbeat_expiration:
-                    experts[i] = RemoteExpert(uid=uid, host=host, port=port)
+            maybe_result, maybe_expiration = lookup.result()
+            if maybe_expiration is not None:  # if we found a value
+                experts[i] = RemoteExpert(uid=uid, host=maybe_result[0], port=maybe_result[1])
 
         future.set_result(experts)
 
-    def declare_experts(self, uids: List[str], addr, port, wait_timeout=0):
+    def declare_experts(self, uids: List[str], addr, port, wait=True, timeout=None) -> Optional[List[bool]]:
         """
         Make experts available to DHT; update timestamps if already available
         :param uids: a list of expert ids to update
         :param addr: hostname that can be used to call this expert
         :param port: port that can be used to call this expert
-        :param wait_timeout: if wait_timeout > 0, waits for the procedure to finish
+        :param wait: if True, waits for the declaration to finish, otherwise runs in background
+        :param timeout: waits for the procedure to finish for at most this many seconds, None means wait indefinitely
+        :returns: if wait is True, returns a list of booleans (True = store succeeded, False = store rejected)
         """
-        done_event = mp.Event() if wait_timeout else None
-        self.pipe.send(('_declare_experts', [], dict(uids=list(uids), addr=addr, port=port, done_event=done_event)))
-        if done_event is not None:
-            done_event.wait(wait_timeout)
+        future, _future = SharedFuture.make_pair() if wait else (None, None)
+        self.pipe.send(('_declare_experts', [], dict(uids=list(uids), addr=addr, port=port, future=_future)))
+        if wait:
+            return future.result(timeout)
 
-    def _declare_experts(self, uids: List[str], addr: str, port: int, done_event: Optional[mp.Event]):
+    def _declare_experts(self, uids: List[str], addr: str, port: int, future: Optional[SharedFuture]):
+        assert self.node is not None, "This method should only be accessed from inside .run method"
         loop = asyncio.get_event_loop()
-        timestamp = datetime.datetime.now()
-        expert_metadata = PickleSerializer.dumps(((addr, port), timestamp))
-        prefix_metadata = PickleSerializer.dumps(timestamp)
-
+        expiration_time = get_dht_time() + self.EXPIRATION
         unique_prefixes = set()
+        coroutines = []
 
         for uid in uids:
-            asyncio.run_coroutine_threadsafe(self.server.set(self.make_key('expert', uid), expert_metadata), loop)
+            coroutines.append(asyncio.run_coroutine_threadsafe(
+                self.node.store(self.make_key('expert', uid), value=(addr, port),
+                                expiration_time=expiration_time),
+                loop))
             uid_parts = uid.split(self.UID_DELIMETER)
             unique_prefixes.update([self.UID_DELIMETER.join(uid_parts[:i + 1]) for i in range(len(uid_parts))])
 
         for prefix in unique_prefixes:
-            asyncio.run_coroutine_threadsafe(self.server.set(self.make_key('prefix', prefix), prefix_metadata), loop)
+            coroutines.append(asyncio.run_coroutine_threadsafe(
+                self.node.store(self.make_key('prefix', prefix), True, expiration_time), loop))
 
-        if done_event is not None:
-            done_event.set()
+        if future is not None:
+            future.set_result([coro.result() for coro in coroutines])  # wait for all coroutines to finish
 
-    def first_k_active(self, prefixes: List[str], k: int, heartbeat_expiration=HEARTBEAT_EXPIRATION, max_prefetch=None):
+    def first_k_active(self, prefixes: List[str], k: int, max_prefetch=None):
         """
         Find k prefixes with active experts; may return less if there aren't enough; used for DMoE beam search
         :param prefixes: a list of uid prefixes ordered from highest to lowest priority
         :param k: return at most *this many* active prefixes
-        :param heartbeat_expiration: consider expert active if his last heartbeat was sent at most this many seconds ago
         :param max_prefetch: pre-dispatch up to *this many* asynchronous expert requests, defaults to pre-dispatch = k
         :returns: a list of at most :k: prefixes that have at least one active expert each;
         """
+        assert isinstance(prefixes, (list, tuple)), "please provide a list/tuple of prefixes as the first argument"
         future, _future = SharedFuture.make_pair()
-        self.pipe.send(('_first_k_active', [], dict(prefixes=prefixes, k=k, heartbeat_expiration=heartbeat_expiration,
-                                                    max_prefetch=max_prefetch or k, future=_future)))
+        self.pipe.send(('_first_k_active', [],
+                        dict(prefixes=prefixes, k=k, max_prefetch=max_prefetch or k, future=_future)))
         return future.result()
 
-    def _first_k_active(self, prefixes: List[str], k, heartbeat_expiration, max_prefetch, future: SharedFuture):
+    def _first_k_active(self, prefixes: List[str], k: int, max_prefetch: Optional[int], future: SharedFuture):
+        assert self.node is not None, "This method should only be accessed from inside .run method"
+        max_prefetch = max_prefetch or len(prefixes)
         loop = asyncio.get_event_loop()
-        lookup_prefetch = [asyncio.run_coroutine_threadsafe(
-            self.server.get(self.make_key('prefix', prefix)), loop) for prefix in prefixes[:max_prefetch]]
-        current_time = datetime.datetime.now()
-
+        lookup_prefetch = [asyncio.run_coroutine_threadsafe(self.node.get(self.make_key('prefix', prefix)), loop)
+                           for prefix in prefixes[:max_prefetch]]
         active_prefixes = []
 
         for i, prefix in enumerate(prefixes):
-            lookup = lookup_prefetch[i]
+            _, maybe_expiration = lookup_prefetch[i].result()
 
-            if lookup.result() is not None:
-                timestamp = PickleSerializer.loads(lookup.result())
-                if (current_time - timestamp).total_seconds() <= heartbeat_expiration:
-                    active_prefixes.append(prefix)
-                    if len(active_prefixes) >= k:
-                        future.set_result(active_prefixes)
-                        return
+            if maybe_expiration is not None:
+                active_prefixes.append(prefix)
+                if len(active_prefixes) >= k:
+                    future.set_result(active_prefixes)
+                    for task in lookup_prefetch[i:]:
+                        task.cancel()
+                    return
 
             # pre-dispatch the next request in line
             if len(lookup_prefetch) < len(prefixes):
                 lookup_prefetch.append(
-                    asyncio.run_coroutine_threadsafe(self.server.get(
-                        self.make_key('prefix', prefixes[len(lookup_prefetch)])), loop)
-                )
+                    asyncio.run_coroutine_threadsafe(
+                        self.node.get(self.make_key('prefix', prefixes[len(lookup_prefetch)])), loop))
 
         # could not find enough active prefixes; return what we can
         future.set_result(active_prefixes)
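
For readers skimming the diff, below is a minimal usage sketch of the new `DHT` wrapper. The expert uids, hostnames and ports are made up for illustration; the calls follow the signatures introduced in this file (`declare_experts`, `get_experts`, `first_k_active`).

```python
from hivemind.dht import DHT

# start the first DHT node in a background process (the port is picked automatically)
first_node = DHT(start=True)
# a second node joins the swarm by listing the first one as an initial peer
second_node = DHT(('127.0.0.1', first_node.port), start=True)

# announce experts served at 127.0.0.1:8080; entries expire after DHT.EXPIRATION seconds
second_node.declare_experts(['ffn.0.3', 'ffn.2.7'], addr='127.0.0.1', port=8080, wait=True)

# look them up from the other node: RemoteExpert for found uids, None otherwise
experts = first_node.get_experts(['ffn.0.3', 'ffn.9.9'])

# find up to k prefixes that have at least one active expert (used by DMoE beam search)
active_prefixes = first_node.first_k_active(['ffn.0', 'ffn.1', 'ffn.2'], k=2)

first_node.shutdown(); second_node.shutdown()
```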

+ 205 - 0
hivemind/dht/node.py

@@ -0,0 +1,205 @@
+import asyncio
+from collections import OrderedDict
+from functools import partial
+from typing import Optional, Tuple, List, Dict
+from warnings import warn
+
+from .protocol import KademliaProtocol
+from .routing import DHTID, DHTValue, DHTExpiration, DHTKey, get_dht_time
+from .search import traverse_dht
+from ..utils import find_open_port, Endpoint, Hostname, Port, LOCALHOST
+
+
+class DHTNode:
+    """
+    A low-level class that represents a DHT participant.
+    Each DHTNode has an identifier, a local storage and access to other nodes via KademliaProtocol.
+
+    :param node_id: current node's identifier, determines which keys it will store locally, defaults to random id
+    :param port: port to which this DHTNode will listen, by default find some open port
+    :param initial_peers: connects to these peers to populate routing table, defaults to no peers
+    :param bucket_size: (k) - max number of nodes in one k-bucket. Trying to add {k+1}st node will cause a bucket to
+      either split in two buckets along the midpoint or reject the new node (but still save it as a replacement)
+      Recommended value: $k$ is chosen s.t. any given k nodes are very unlikely to all fail after staleness_timeout
+    :param num_replicas: (≈k) - number of nearest nodes that will be asked to store a given key, default = bucket_size
+    :param depth_modulo: (b) - a full bucket can be split if it contains this node's own id or if its depth is not a multiple of this value (Section 4.2)
+    :param wait_timeout: a kademlia rpc request is deemed lost if we did not receive a reply in this many seconds
+    :param staleness_timeout: a bucket is considered stale if no node from that bucket was updated in this many seconds
+    :param bootstrap_timeout: after one of peers responds, await other peers for at most this many seconds
+    :param interface: provide 0.0.0.0 to operate over ipv4, :: to operate over ipv6, localhost to operate locally, etc.
+
+    :note: Hivemind DHT is optimized to store temporary metadata that is regularly updated.
+     For example, an expert alive timestamp that is emitted by the Server responsible for that expert.
+     Such metadata does not require maintenance such as ensuring at least k hosts have it or (de)serialization in case
+     of node shutdown. Instead, DHTNode is designed to reduce the latency of looking up such data.
+
+    Every (key, value) pair in this DHT has an expiration_time - a float measured with get_dht_time() (UnixTime by default)
+    Informally, dht nodes always prefer values with higher expiration_time and may delete any value past its expiration.
+
+    Formally, DHTNode follows this contract:
+      - when asked to store(key, value, expiration_time), a node must store (key, value) at least until expiration_time
+       unless it already stores that key with greater or equal expiration_time - if so, the node must keep the previous value
+      - when asked to get(key), a node must return the value with highest expiration time IF that time has not come yet
+       if the highest expiration time is less than current get_dht_time(), DHTNode *may* return None
+    """
+
+    def __init__(self, node_id: Optional[DHTID] = None, port: Optional[Port] = None, initial_peers: List[Endpoint] = (),
+                 bucket_size: int = 20, num_replicas: Optional[int] = None, depth_modulo: int = 5,
+                 wait_timeout: float = 5, staleness_timeout: Optional[float] = 600,
+                 bootstrap_timeout: Optional[float] = None, cache_locally: bool = True, cache_nearest: int = 1,
+                 interface: Hostname = '0.0.0.0'):
+        self.node_id = node_id = node_id if node_id is not None else DHTID.generate()
+        self.port = port = port if port is not None else find_open_port()
+        self.num_replicas = num_replicas if num_replicas is not None else bucket_size
+        self.cache_locally, self.cache_nearest = cache_locally, cache_nearest
+        self.staleness_timeout = staleness_timeout
+
+        # create kademlia protocol and make it listen to a port
+        loop = asyncio.get_event_loop()
+        make_protocol = partial(KademliaProtocol, self.node_id, bucket_size, depth_modulo, wait_timeout)
+        listener = loop.run_until_complete(loop.create_datagram_endpoint(make_protocol, local_addr=(interface, port)))
+        self.transport: asyncio.Transport = listener[0]
+        self.protocol: KademliaProtocol = listener[1]
+
+        if initial_peers:
+            # bootstrap part 1: ping initial_peers, add each other to the routing table
+            bootstrap_timeout = bootstrap_timeout if bootstrap_timeout is not None else wait_timeout
+            began_bootstrap_time = get_dht_time()
+            ping_tasks = map(self.protocol.call_ping, initial_peers)
+            finished_tasks, remaining_tasks = loop.run_until_complete(
+                asyncio.wait(ping_tasks, timeout=wait_timeout, return_when=asyncio.FIRST_COMPLETED))
+            time_to_first_response = get_dht_time() - began_bootstrap_time
+            # bootstrap part 2: gather all peers who responded within bootstrap_timeout, but at least one peer
+            if remaining_tasks:
+                finished_in_time, stragglers = loop.run_until_complete(
+                    asyncio.wait(remaining_tasks, timeout=bootstrap_timeout - time_to_first_response))
+                for straggler in stragglers:
+                    straggler.cancel()
+                finished_tasks |= finished_in_time
+
+            peer_ids = [task.result() for task in finished_tasks if task.result() is not None]
+            if len(peer_ids) == 0 and len(initial_peers) != 0:
+                warn("DHTNode bootstrap failed: none of the initial_peers responded to a ping.")
+
+            # bootstrap part 3: run beam search for my node id to add my own nearest neighbors to the routing table
+            # ... and maybe receive some values that we are meant to store (see protocol.update_routing_table)
+            loop.run_until_complete(self.find_nearest_nodes(query_id=self.node_id))
+
+    async def find_nearest_nodes(self, query_id: DHTID, k_nearest: Optional[int] = None,
+                                 beam_size: Optional[int] = None, exclude_self: bool = False) -> Dict[DHTID, Endpoint]:
+        """
+        Traverse the DHT and find :k_nearest: nodes to a given :query_id:, optionally :exclude_self: from the results.
+        :note: this is a thin wrapper over dht.search.beam_search, look there for more details
+        :returns: an ordered dictionary of [peer DHTID -> network Endpoint], ordered from nearest to farthest neighbor
+        """
+        k_nearest = k_nearest if k_nearest is not None else self.protocol.bucket_size
+        beam_size = beam_size if beam_size is not None else max(self.protocol.bucket_size, k_nearest)
+        node_to_addr = dict(
+            self.protocol.routing_table.get_nearest_neighbors(query_id, beam_size, exclude=self.node_id))
+
+        async def get_neighbors(node: DHTID) -> Tuple[List[DHTID], bool]:
+            peers: Dict[DHTID, Endpoint] = await self.protocol.call_find_node(node_to_addr[node], query_id)
+            node_to_addr.update(peers)
+            return list(peers.keys()), False  # False means "do not interrupt beam search"
+
+        nearest_nodes, visited_nodes = await traverse_dht(
+            query_id=query_id, initial_nodes=list(node_to_addr), k_nearest=k_nearest, beam_size=beam_size,
+            get_neighbors=get_neighbors, visited_nodes=(self.node_id,))
+
+        if not exclude_self:
+            nearest_nodes = sorted(nearest_nodes + [self.node_id], key=query_id.xor_distance)[:k_nearest]
+            node_to_addr[self.node_id] = (LOCALHOST, self.port)
+
+        return OrderedDict((node, node_to_addr[node]) for node in nearest_nodes)
+
+    async def store(self, key: DHTKey, value: DHTValue, expiration_time: DHTExpiration) -> bool:
+        """
+        Find :num_replicas: nearest nodes to the key and ask them to store (key, value) at least until expiration time.
+        Also cache (key, value, expiration_time) at all nodes you met along the way (see Section 2.1 end)
+        :return: True if store succeeds, False if it fails (due to no response or newer value)
+        """
+        key_id = DHTID.generate(key)
+        nearest_node_to_addr = await self.find_nearest_nodes(key_id, k_nearest=self.num_replicas, exclude_self=True)
+        tasks = [asyncio.create_task(self.protocol.call_store(endpoint, key_id, value, expiration_time))
+                 for endpoint in nearest_node_to_addr.values()]
+        done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+        return any(done)
+
+    async def get(self, key: DHTKey, sufficient_expiration_time: Optional[DHTExpiration] = None,
+                  beam_size: Optional[int] = None) -> Tuple[Optional[DHTValue], Optional[DHTExpiration]]:
+        """
+        :param key: traverse the DHT and find the value for this key (or return None if it does not exist)
+        :param sufficient_expiration_time: if the search finds a value that expires after this time,
+            it can stop early and return that value. Default = time of call, i.e. any value that has not expired yet
+            If sufficient_expiration_time=float('inf'), this method will find the value with the _latest_ expiration
+        :returns: value and its expiration time. If nothing is found, returns (None, None).
+        :note: in order to check whether get found a value, check that the returned expiration_time is not None
+        """
+        key_id = DHTID.generate(key)
+        sufficient_expiration_time = sufficient_expiration_time or get_dht_time()
+        beam_size = beam_size if beam_size is not None else self.protocol.bucket_size
+        latest_value, latest_expiration, latest_node_id = None, -float('inf'), None
+        node_to_addr, nodes_checked_for_value = dict(), set()
+        should_cache = False  # True if found value in DHT that is newer than local value
+
+        # Option A: the value may already be in our local storage or cache
+        maybe_value, maybe_expiration = self.protocol.storage.get(key_id)
+        if maybe_expiration is None:
+            maybe_value, maybe_expiration = self.protocol.cache.get(key_id)
+        if maybe_expiration is not None and maybe_expiration > latest_expiration:
+            latest_value, latest_expiration, latest_node_id = maybe_value, maybe_expiration, self.node_id
+            # TODO(jheuristic) we may want to run background beam search to update our cache
+        nodes_checked_for_value.add(self.node_id)
+
+        # Option B: go beam search the DHT
+        if latest_expiration < sufficient_expiration_time:
+            node_to_addr.update(self.protocol.routing_table.get_nearest_neighbors(
+                key_id, self.protocol.bucket_size, exclude=self.node_id))
+
+            async def get_neighbors(node: DHTID) -> Tuple[List[DHTID], bool]:
+                nonlocal latest_value, latest_expiration, node_to_addr, nodes_checked_for_value
+                maybe_value, maybe_expiration, peers = await self.protocol.call_find_value(node_to_addr[node], key_id)
+                node_to_addr.update(peers)
+                nodes_checked_for_value.add(node)
+                if maybe_expiration is not None and maybe_expiration > latest_expiration:
+                    latest_value, latest_expiration, latest_node_id = maybe_value, maybe_expiration, node
+                should_interrupt = (latest_expiration >= sufficient_expiration_time)
+                return list(peers.keys()), should_interrupt
+
+            nearest_nodes, visited_nodes = await traverse_dht(
+                query_id=key_id, initial_nodes=list(node_to_addr), k_nearest=beam_size, beam_size=beam_size,
+                get_neighbors=get_neighbors, visited_nodes=nodes_checked_for_value)
+            # normally, by this point we will have found a sufficiently recent value in one of get_neighbors calls
+            should_cache = latest_expiration >= sufficient_expiration_time  # if we found a newer value, cache it later
+
+        # Option C: didn't find good-enough value in beam search, make a last-ditch effort to find it in unvisited nodes
+        if latest_expiration < sufficient_expiration_time:
+            nearest_unvisited = [node_id for node_id in nearest_nodes if node_id not in nodes_checked_for_value]
+            tasks = [self.protocol.call_find_value(node_to_addr[node_id], key_id) for node_id in nearest_unvisited]
+            pending_tasks = set(tasks)
+            for task in asyncio.as_completed(tasks):
+                pending_tasks.remove(task)
+                maybe_value, maybe_expiration, _ = await task
+                if maybe_expiration is not None and maybe_expiration > latest_expiration:
+                    latest_value, latest_expiration = maybe_value, maybe_expiration
+                    if latest_expiration >= sufficient_expiration_time:
+                        break
+            for task in pending_tasks:
+                task.close()
+            should_cache = latest_expiration >= sufficient_expiration_time  # if we found a newer value, cache it later
+
+        # step 4: if we found a value in the DHT that is newer than our local value, cache it locally and/or at the nearest nodes
+        if should_cache and self.cache_locally:
+            self.protocol.cache.store(key_id, latest_value, latest_expiration)
+        if should_cache and self.cache_nearest:
+            num_cached_nodes = 0
+            for node_id in nearest_nodes:
+                if node_id == latest_node_id:
+                    continue
+                asyncio.create_task(self.protocol.call_store(
+                    node_to_addr[node_id], key_id, latest_value, latest_expiration, in_cache=True))
+                num_cached_nodes += 1
+                if num_cached_nodes >= self.cache_nearest:
+                    break
+
+        return (latest_value, latest_expiration) if latest_expiration != -float('inf') else (None, None)
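
To make the store/get contract above concrete, here is a rough two-node sketch. The key, endpoint and 60-second lifetime are arbitrary, and it assumes both nodes live in one process and share the default event loop (DHTNode.__init__ drives the loop itself, so nodes are created outside of any coroutine).

```python
import asyncio
from hivemind.dht.node import DHTNode
from hivemind.dht.routing import get_dht_time

loop = asyncio.get_event_loop()

# DHTNode.__init__ opens a UDP endpoint on the current event loop
alice = DHTNode()
bob = DHTNode(initial_peers=[('127.0.0.1', alice.port)])

# ask the DHT to store a value for ~60 seconds, then read it back from the other node
store_ok = loop.run_until_complete(
    bob.store('expert.ffn.0.3', value=('127.0.0.1', 8080),
              expiration_time=get_dht_time() + 60))
value, expiration = loop.run_until_complete(alice.get('expert.ffn.0.3'))
# if the store succeeded and has not expired, value holds the stored endpoint
# (depending on the msgpack version, the tuple may come back as a list)
```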

+ 196 - 0
hivemind/dht/protocol.py

@@ -0,0 +1,196 @@
+import asyncio
+import heapq
+from typing import Optional, List, Tuple, Dict
+from rpcudp.protocol import RPCProtocol
+
+from .routing import RoutingTable, DHTID, DHTValue, DHTExpiration, BinaryDHTID, get_dht_time
+from ..utils import Endpoint
+
+
+class KademliaProtocol(RPCProtocol):
+    """
+    A protocol that allows DHT nodes to request keys/neighbors from other DHT nodes.
+    As a side-effect, KademliaProtocol also maintains a routing table as described in
+    https://pdos.csail.mit.edu/~petar/papers/maymounkov-kademlia-lncs.pdf
+
+    See DHTNode (node.py) for a more detailed description.
+
+    :note: the rpc_* methods defined in this class will be automatically exposed to other DHT nodes,
+     for instance, def rpc_ping can be called as protocol.call_ping(addr, dht_id) from a remote machine
+     Only the call_* methods are meant to be called publicly, e.g. from DHTNode
+     Read more: https://github.com/bmuller/rpcudp/tree/master/rpcudp
+    """
+
+    def __init__(self, node_id: DHTID, bucket_size: int, depth_modulo: int,
+                 wait_timeout: float, cache_size: Optional[int] = None):
+        super().__init__(wait_timeout)
+        self.node_id, self.bucket_size = node_id, bucket_size
+        self.routing_table = RoutingTable(node_id, bucket_size, depth_modulo)
+        self.storage = LocalStorage()
+        self.cache = LocalStorage(maxsize=cache_size)
+
+    def rpc_ping(self, sender: Endpoint, sender_id_bytes: BinaryDHTID) -> BinaryDHTID:
+        """ Some dht node wants us to add it to our routing table. """
+        asyncio.ensure_future(self.update_routing_table(DHTID.from_bytes(sender_id_bytes), sender))
+        return bytes(self.node_id)
+
+    async def call_ping(self, recipient: Endpoint) -> Optional[DHTID]:
+        """ Get recipient's node id and add him to the routing table. If recipient doesn't respond, return None """
+        responded, response = await self.ping(recipient, bytes(self.node_id))
+        recipient_node_id = DHTID.from_bytes(response) if responded else None
+        asyncio.ensure_future(self.update_routing_table(recipient_node_id, recipient, responded=responded))
+        return recipient_node_id
+
+    def rpc_store(self, sender: Endpoint, sender_id_bytes: BinaryDHTID, key_bytes: BinaryDHTID,
+                  value: DHTValue, expiration_time: DHTExpiration, in_cache: bool) -> Tuple[bool, BinaryDHTID]:
+        """ Some node wants us to store this (key, value) pair """
+        asyncio.ensure_future(self.update_routing_table(DHTID.from_bytes(sender_id_bytes), sender))
+        if in_cache:
+            store_accepted = self.cache.store(DHTID.from_bytes(key_bytes), value, expiration_time)
+        else:
+            store_accepted = self.storage.store(DHTID.from_bytes(key_bytes), value, expiration_time)
+        return store_accepted, bytes(self.node_id)
+
+    async def call_store(self, recipient: Endpoint, key: DHTID, value: DHTValue,
+                         expiration_time: DHTExpiration, in_cache: bool = False) -> Optional[bool]:
+        """
+        Ask a recipient to store (key, value) pair until expiration time or update their older value
+        :returns: True if value was accepted, False if it was rejected (recipient has newer value), None if no response
+        """
+        responded, response = await self.store(recipient, bytes(self.node_id), bytes(key),
+                                               value, expiration_time, in_cache)
+        if responded:
+            store_accepted, recipient_node_id = response[0], DHTID.from_bytes(response[1])
+            asyncio.ensure_future(self.update_routing_table(recipient_node_id, recipient, responded=responded))
+            return store_accepted
+        return None
+
+    def rpc_find_node(self, sender: Endpoint, sender_id_bytes: BinaryDHTID,
+                      query_id_bytes: BinaryDHTID) -> Tuple[List[Tuple[BinaryDHTID, Endpoint]], BinaryDHTID]:
+        """
+        Someone wants to find :query_id: in the DHT. Return the k nearest neighbors from our routing table
+        :returns: a list of pairs (node_id, address) of :bucket_size: nodes nearest to query_id according to XOR distance,
+         also returns our own node id for routing table maintenance
+        """
+        query_id, sender_id = DHTID.from_bytes(query_id_bytes), DHTID.from_bytes(sender_id_bytes)
+        asyncio.ensure_future(self.update_routing_table(sender_id, sender))
+        peer_ids_and_addr = self.routing_table.get_nearest_neighbors(query_id, k=self.bucket_size, exclude=sender_id)
+        return [(bytes(peer_id), peer_addr) for peer_id, peer_addr in peer_ids_and_addr], bytes(self.node_id)
+
+    async def call_find_node(self, recipient: Endpoint, query_id: DHTID) -> Dict[DHTID, Endpoint]:
+        """
+        Ask a recipient to give you its nearest neighbors to query_id. If the recipient knows query_id directly,
+         it will be returned as the first of the neighbors; if the recipient does not respond, return an empty dict.
+        :returns: a dictionary[node id => address] as per Section 2.3 of the paper
+        """
+        responded, response = await self.find_node(recipient, bytes(self.node_id), bytes(query_id))
+        if responded:
+            peers = {DHTID.from_bytes(peer_id_bytes): tuple(addr) for peer_id_bytes, addr in response[0]}
+            # Note: we convert addr from list to tuple here --^ because some msgpack versions convert tuples to lists
+            recipient_node_id = DHTID.from_bytes(response[1])
+            asyncio.ensure_future(self.update_routing_table(recipient_node_id, recipient, responded=responded))
+            return peers
+        return {}
+
+    def rpc_find_value(self, sender: Endpoint, sender_id_bytes: BinaryDHTID, key_bytes: BinaryDHTID) -> \
+            Tuple[Optional[DHTValue], Optional[DHTExpiration], List[Tuple[BinaryDHTID, Endpoint]], BinaryDHTID]:
+        """
+        Someone wants to find value corresponding to key. If we have the value, return the value and its expiration time
+         Either way, return :bucket_size: nearest neighbors to that node.
+        :note: this is a deviation from Section 2.3 of the paper; original kademlia returned EITHER value OR neighbors
+        :returns: (value or None, expiration time or None, nearest neighbors, our own dht id)
+        """
+        maybe_value, maybe_expiration = self.storage.get(DHTID.from_bytes(key_bytes))
+        cached_value, cached_expiration = self.cache.get(DHTID.from_bytes(key_bytes))
+        if (cached_expiration or -float('inf')) > (maybe_expiration or -float('inf')):
+            maybe_value, maybe_expiration = cached_value, cached_expiration
+        nearest_neighbors, my_id = self.rpc_find_node(sender, sender_id_bytes, key_bytes)
+        return maybe_value, maybe_expiration, nearest_neighbors, my_id
+
+    async def call_find_value(self, recipient: Endpoint, key: DHTID) -> \
+            Tuple[Optional[DHTValue], Optional[DHTExpiration], Dict[DHTID, Endpoint]]:
+        """
+        Ask a recipient to give you the value, if it has one, or nearest neighbors to your key.
+        :returns: (optional value, optional expiration time, and neighbors)
+         value: whatever was the latest value stored by the recipient with that key (see DHTNode contract)
+         expiration time: expiration time of the returned value, None if no value was found
+         neighbors:  a dictionary[node id => address] as per Section 2.3 of the paper;
+        Note: if no response, returns None, None, {}
+        """
+        responded, response = await self.find_value(recipient, bytes(self.node_id), bytes(key))
+        if responded:
+            (value, expiration_time, peers_bytes), recipient_id = response[:-1], DHTID.from_bytes(response[-1])
+            peers = {DHTID.from_bytes(peer_id_bytes): tuple(addr) for peer_id_bytes, addr in peers_bytes}
+            asyncio.ensure_future(self.update_routing_table(recipient_id, recipient, responded=responded))
+            return value, expiration_time, peers
+        return None, None, {}
+
+    async def update_routing_table(self, node_id: Optional[DHTID], addr: Endpoint, responded=True):
+        """
+        This method is called on every incoming AND outgoing request to update the routing table
+        :param addr: sender endpoint for incoming requests, recipient endpoint for outgoing requests
+        :param node_id: sender node id for incoming requests, recipient node id for outgoing requests
+        :param responded: for outgoing requests, this indicates whether the recipient responded or not.
+          For incoming requests, this should always be True
+        """
+        if responded:  # incoming request or outgoing request with response
+            maybe_node_to_ping = self.routing_table.add_or_update_node(node_id, addr)
+            if maybe_node_to_ping is not None:
+                # we couldn't add new node because the table was full. Check if existing peers are alive (Section 2.2)
+                # ping one least-recently updated peer: if it won't respond, remove it from the table, else update it
+                await self.call_ping(maybe_node_to_ping[1])  # [1]-th element is that node's endpoint
+
+        else:  # outgoing request and peer did not respond
+            if node_id is not None and node_id in self.routing_table:
+                del self.routing_table[node_id]
+
+    def _accept_response(self, msg_id, data, address):
+        """ Override for RPCProtocol._accept_response to handle cancelled tasks """
+        future, timeout = self._outstanding[msg_id]
+        if future.cancelled():
+            timeout.cancel()
+            del self._outstanding[msg_id]
+        else:
+            super()._accept_response(msg_id, data, address)
+
+
+
+class LocalStorage:
+    def __init__(self, maxsize: Optional[int] = None):
+        self.cache_size = maxsize or float("inf")
+        self.data = dict()
+        self.expiration_heap = []
+        self.key_to_heap = dict()
+
+    def remove_outdated(self):
+        while self.expiration_heap and (self.expiration_heap[0][0] < get_dht_time()
+                                        or len(self.expiration_heap) > self.cache_size):
+            heap_entry = heapq.heappop(self.expiration_heap)
+            key = heap_entry[1]
+            if self.key_to_heap[key] == heap_entry:
+                del self.data[key], self.key_to_heap[key]
+
+    def store(self, key: DHTID, value: DHTValue, expiration_time: DHTExpiration) -> bool:
+        """
+        Store a (key, value) pair locally at least until expiration_time. See class docstring for details.
+        :returns: True if new value was stored, False if it was rejected (current value is newer)
+        """
+        if expiration_time < get_dht_time():
+            return False
+        self.key_to_heap[key] = (expiration_time, key)
+        heapq.heappush(self.expiration_heap, (expiration_time, key))
+        if key in self.data:
+            if self.data[key][1] < expiration_time:
+                self.data[key] = (value, expiration_time)
+                return True
+            return False
+        self.data[key] = (value, expiration_time)
+        self.remove_outdated()
+        return True
+
+    def get(self, key: DHTID) -> Tuple[Optional[DHTValue], Optional[DHTExpiration]]:
+        """ Get a value corresponding to a key if that (key, value) pair was previously stored here. """
+        self.remove_outdated()
+        if key in self.data:
+            return self.data[key]
+        return None, None
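
The LocalStorage rules (a newer expiration wins, expired or over-capacity entries are evicted) can be illustrated with a short sketch; the key names and timings below are arbitrary.

```python
from hivemind.dht.protocol import LocalStorage
from hivemind.dht.routing import DHTID, get_dht_time

storage = LocalStorage(maxsize=2)
key = DHTID.generate(source='some_key')
now = get_dht_time()

assert storage.store(key, b'old', now + 10)           # accepted: the key is new
assert not storage.store(key, b'outdated', now + 5)   # rejected: stored value expires later
assert storage.store(key, b'new', now + 60)           # accepted: new value expires later
assert storage.get(key) == (b'new', now + 60)
assert storage.get(DHTID.generate(source='missing')) == (None, None)
```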

+ 266 - 0
hivemind/dht/routing.py

@@ -0,0 +1,266 @@
+from __future__ import annotations
+
+import hashlib
+import os
+import random
+
+import time
+import heapq
+from collections.abc import Iterable
+from itertools import chain
+from typing import Tuple, Optional, List, Dict, Set, Union, Any, Sequence, Iterator
+
+from ..utils import Endpoint, PickleSerializer
+
+
+class RoutingTable:
+    """
+    A data structure that contains DHT peers bucketed according to their distance to node_id
+    :param node_id: node id used to measure distance
+    :param bucket_size: parameter $k$ from Kademlia paper Section 2.2
+    :param depth_modulo: parameter $b$ from Kademlia paper Section 2.2.
+    :note: a more detailed description can be found in the DHTNode class docstring (see node.py)
+    :note: kademlia paper refers to https://pdos.csail.mit.edu/~petar/papers/maymounkov-kademlia-lncs.pdf
+    """
+
+    def __init__(self, node_id: DHTID, bucket_size: int, depth_modulo: int):
+        self.node_id, self.bucket_size, self.depth_modulo = node_id, bucket_size, depth_modulo
+        self.buckets = [KBucket(node_id.MIN, node_id.MAX, bucket_size)]
+
+    def get_bucket_index(self, node_id: DHTID) -> int:
+        """ Get the index of the bucket that the given node would fall into. """
+        # TODO use binsearch aka from bisect import bisect.
+        for index, bucket in enumerate(self.buckets):
+            if bucket.lower <= node_id < bucket.upper:
+                return index
+        raise ValueError(f"Failed to get bucket for node_id={node_id}, this should not be possible.")
+
+    def add_or_update_node(self, node_id: DHTID, addr: Endpoint) -> Optional[Tuple[DHTID, Endpoint]]:
+        """
+        Update routing table after an incoming request from :addr: (host:port) or outgoing request to :addr:
+        :returns: If we cannot add node_id to the routing table, return the least-recently-updated node (Section 2.2)
+        :note: KademliaProtocol calls this method for every incoming and outgoing request if there was a response.
+          If this method returned a node to be ping-ed, the protocol will ping it to check and either move it to
+          the start of the table or remove that node and replace it with one of the pending replacement nodes
+        """
+        bucket_index = self.get_bucket_index(node_id)
+        bucket = self.buckets[bucket_index]
+
+        if bucket.add_or_update_node(node_id, addr):
+            return  # this will succeed unless the bucket is full
+
+        # Per section 4.2 of paper, split if the bucket has node's own id in its range
+        # or if bucket depth is not congruent to 0 mod $b$
+        if bucket.has_in_range(self.node_id) or bucket.depth % self.depth_modulo != 0:
+            self.split_bucket(bucket_index)
+            return self.add_or_update_node(node_id, addr)
+
+        # The bucket is full and won't split further. Return a node to ping (see this method's docstring)
+        return bucket.request_ping_node()
+
+    def split_bucket(self, index: int) -> None:
+        """ Split bucket range in two equal parts and reassign nodes to the appropriate half """
+        first, second = self.buckets[index].split()
+        self.buckets[index] = first
+        self.buckets.insert(index + 1, second)
+
+    def __getitem__(self, node_id: DHTID) -> Endpoint:
+        return self.buckets[self.get_bucket_index(node_id)][node_id]
+
+    def __setitem__(self, node_id: DHTID, addr: Endpoint) -> None:
+        raise NotImplementedError("RoutingTable doesn't support direct item assignment. Use add_or_update_node instead")
+
+    def __contains__(self, node_id: DHTID) -> bool:
+        return node_id in self.buckets[self.get_bucket_index(node_id)]
+
+    def __delitem__(self, node_id: DHTID):
+        node_bucket = self.buckets[self.get_bucket_index(node_id)]
+        del node_bucket[node_id]
+
+    def get_nearest_neighbors(
+            self, query_id: DHTID, k: int, exclude: Optional[DHTID] = None) -> List[Tuple[DHTID, Endpoint]]:
+        """
+        Find k nearest neighbors from routing table according to XOR distance, does NOT include self.node_id
+        :param query_id: find neighbors of this node
+        :param k: find this many neighbors. If there aren't enough nodes in the table, returns all nodes
+        :param exclude: if provided, results will not contain this node id even if it is in the table
+        :returns: a list of tuples (node_id, endpoint) for up to k neighbors sorted from nearest to farthest
+
+        :note: this is a semi-exhaustive search of nodes that takes O(n * log k) time.
+            One can implement a more efficient knn search using a binary skip-tree in some
+            more elegant language such as c++ / cython / numba.
+            Here's a sketch
+
+            Preparation: construct a non-regular binary tree of depth (2 * DHTID.HASH_NBYTES)
+             Each leaf corresponds to a binary DHTID with '0' for every left turn and '1' for right turn
+             Each non-leaf corresponds to a certain prefix, e.g. 0010110???...???
+             If there are no nodes under some prefix xxxY???..., the corresponding node xxx????...
+             will only have one child.
+            Add(node):
+             Traverse down a tree, on i-th level go left if node_i == 0, right if node_i == 1
+             If the corresponding node is missing, simply create it
+            Search(query, k):
+             Traverse the tree with a depth-first search, on i-th level go left if query_i == 0, else right
+             If the corresponding node is missing, go the other way. Proceed until you find a leaf.
+             This leaf is your nearest neighbor. Now add more neighbors by considering alternative paths
+             bottom-up, i.e. if your nearest neighbor is 01011, first try 01010, then 0100x, then 011xx, ...
+
+            This results in O(num_nodes * bit_length) complexity for add and search
+            Better yet: use binary tree with skips for O(num_nodes * log(num_nodes))
+        """
+        all_nodes: Iterator[Tuple[DHTID, Endpoint]] = chain(*self.buckets)  # uses KBucket.__iter__
+        nearest_nodes_with_addr: List[Tuple[DHTID, Endpoint]] = heapq.nsmallest(
+            k + int(exclude is not None), all_nodes, lambda id_and_endpoint: query_id.xor_distance(id_and_endpoint[0]))
+        if exclude is not None:
+            for i, (node_i, addr_i) in enumerate(list(nearest_nodes_with_addr)):
+                if node_i == exclude:
+                    del nearest_nodes_with_addr[i]
+                    break
+            if len(nearest_nodes_with_addr) > k:
+                nearest_nodes_with_addr.pop()  # if excluded element is not among (k + 1) nearest, simply crop to k
+        return nearest_nodes_with_addr
+
+    def __repr__(self):
+        bucket_info = "\n".join(repr(bucket) for bucket in self.buckets)
+        return f"{self.__class__.__name__}(node_id={self.node_id}, bucket_size={self.bucket_size}," \
+               f" modulo={self.depth_modulo},\nbuckets=[\n{bucket_info})"
+
+
+class KBucket:
+    """
+    A bucket containing up to :size: of DHTIDs in [lower, upper) semi-interval.
+    Maps DHT node ids to their endpoints (hostname, port)
+    """
+    def __init__(self, lower: int, upper: int, size: int, depth: int = 0):
+        assert upper - lower == 2 ** (DHTID.HASH_NBYTES * 8 - depth)
+        self.lower, self.upper, self.size, self.depth = lower, upper, size, depth
+        self.nodes_to_addr: Dict[DHTID, Endpoint] = {}
+        self.replacement_nodes: Dict[DHTID, Endpoint] = {}
+        self.nodes_requested_for_ping: Set[DHTID] = set()
+        self.last_updated = get_dht_time()
+
+    def has_in_range(self, node_id: DHTID):
+        """ Check if node_id is between this bucket's lower and upper bounds """
+        return self.lower <= node_id < self.upper
+
+    def add_or_update_node(self, node_id: DHTID, addr: Endpoint) -> bool:
+        """
+        Add node to KBucket or update existing node, return True if successful, False if the bucket is full.
+        If the bucket is full, keep track of node in a replacement list, per section 4.1 of the paper.
+        :param node_id: dht node identifier that should be added or moved to the front of bucket
+        :param addr: a pair of (hostname, port) associated with that node id
+        :note: this function has a side-effect of resetting KBucket.last_updated time
+        """
+        if node_id in self.nodes_requested_for_ping:
+            self.nodes_requested_for_ping.remove(node_id)
+        self.last_updated = get_dht_time()
+        if node_id in self.nodes_to_addr:
+            del self.nodes_to_addr[node_id]
+            self.nodes_to_addr[node_id] = addr
+        elif len(self) < self.size:
+            self.nodes_to_addr[node_id] = addr
+        else:
+            if node_id in self.replacement_nodes:
+                del self.replacement_nodes[node_id]
+            self.replacement_nodes[node_id] = addr
+            return False
+        return True
+
+    def request_ping_node(self) -> Optional[Tuple[DHTID, Endpoint]]:
+        """ :returns: least-recently updated node that isn't already being pinged right now -- if such node exists """
+        for uid, endpoint in self.nodes_to_addr.items():
+            if uid not in self.nodes_requested_for_ping:
+                return uid, endpoint
+
+    def __getitem__(self, node_id: DHTID) -> Endpoint:
+        return self.nodes_to_addr[node_id] if node_id in self.nodes_to_addr else self.replacement_nodes[node_id]
+
+    def __delitem__(self, node_id: DHTID):
+        if not (node_id in self.nodes_to_addr or node_id in self.replacement_nodes):
+            raise KeyError(f"KBucket does not contain node id={node_id}.")
+
+        if node_id in self.replacement_nodes:
+            del self.replacement_nodes[node_id]
+
+        if node_id in self.nodes_to_addr:
+            del self.nodes_to_addr[node_id]
+
+            if self.replacement_nodes:
+                newnode_id, newnode = self.replacement_nodes.popitem()
+                self.nodes_to_addr[newnode_id] = newnode
+
+    def __len__(self):
+        return len(self.nodes_to_addr)
+
+    def __iter__(self):
+        return iter(self.nodes_to_addr.items())
+
+    def split(self) -> Tuple[KBucket, KBucket]:
+        """ Split bucket over midpoint, rounded down, assign nodes to according to their id """
+        midpoint = (self.lower + self.upper) // 2
+        assert self.lower < midpoint < self.upper, f"Bucket too small to be split: [{self.lower}: {self.upper})"
+        left = KBucket(self.lower, midpoint, self.size, depth=self.depth + 1)
+        right = KBucket(midpoint, self.upper, self.size, depth=self.depth + 1)
+        for node_id, addr in chain(self.nodes_to_addr.items(), self.replacement_nodes.items()):
+            bucket = left if int(node_id) < midpoint else right  # the midpoint itself belongs to the right bucket [midpoint, upper)
+            bucket.add_or_update_node(node_id, addr)
+        return left, right
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({len(self.nodes_to_addr)} nodes" \
+               f" with {len(self.replacement_nodes)} replacements, depth={self.depth}, max size={self.size}" \
+               f" lower={hex(self.lower)}, upper={hex(self.upper)})"
+
+
+class DHTID(int):
+    HASH_FUNC = hashlib.sha1
+    HASH_NBYTES = 20  # SHA1 produces a 20-byte (aka 160bit) number
+    RANGE = MIN, MAX = 0, 2 ** (HASH_NBYTES * 8)  # inclusive min, exclusive max
+
+    def __new__(cls, value: int):
+        assert cls.MIN <= value < cls.MAX, f"DHTID must be in [{cls.MIN}, {cls.MAX}) but got {value}"
+        return super().__new__(cls, value)
+
+    @classmethod
+    def generate(cls, source: Optional[Any] = None, nbits: int = 255):
+        """
+        Generates random uid based on SHA1
+        :param source: if provided, converts this value to bytes and uses it as input for hashing function;
+            by default, generates a random dhtid from :nbits: random bits
+        """
+        source = random.getrandbits(nbits).to_bytes(nbits, byteorder='big') if source is None else source
+        source = PickleSerializer.dumps(source) if not isinstance(source, bytes) else source
+        raw_uid = cls.HASH_FUNC(source).digest()
+        return cls(int(raw_uid.hex(), 16))
+
+    def xor_distance(self, other: Union[DHTID, Sequence[DHTID]]) -> Union[int, List[int]]:
+        """
+        :param other: one or multiple DHTIDs. If given multiple DHTIDs as other, this function
+         will compute distance from self to each of DHTIDs in other.
+        :return: a number or a list of numbers whose binary representations equal bitwise xor between DHTIDs.
+        """
+        if isinstance(other, Iterable):
+            return list(map(self.xor_distance, other))  # TODO make some SIMD
+        return int(self) ^ int(other)
+
+    @classmethod
+    def longest_common_prefix_length(cls, *ids: DHTID) -> int:
+        ids_bits = [bin(uid)[2:].rjust(8 * cls.HASH_NBYTES, '0') for uid in ids]
+        return len(os.path.commonprefix(ids_bits))
+
+    def to_bytes(self, length=HASH_NBYTES, byteorder='big', *, signed=False) -> bytes:
+        return super().to_bytes(length, byteorder, signed=signed)
+
+    @classmethod
+    def from_bytes(cls, bytes, byteorder='big', *, signed=False) -> DHTID:
+        return DHTID(super().from_bytes(bytes, byteorder=byteorder, signed=signed))
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({hex(self)})"
+
+    def __bytes__(self):
+        return self.to_bytes()
+
+
+DHTKey, DHTValue, DHTExpiration, BinaryDHTID = Any, Any, float, bytes  # flavour types
+get_dht_time = time.time  # time used by all DHT functionality; can be replaced with any infrastructure-wide clock
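
For reference, a minimal sanity-check sketch of the DHTID helpers defined above (the seed bytes are arbitrary and not part of the diff):

    from hivemind.dht.routing import DHTID

    a, b = DHTID.generate(), DHTID.generate(source=b"some peer-specific seed")
    assert DHTID.MIN <= a < DHTID.MAX and DHTID.MIN <= b < DHTID.MAX
    assert a.xor_distance(a) == 0 and a.xor_distance(b) == int(a) ^ int(b)
    assert a.xor_distance([a, b]) == [0, int(a) ^ int(b)]  # batched form returns a list of distances
    assert DHTID.from_bytes(bytes(a)) == a  # 20-byte big-endian round-trip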

+ 62 - 0
hivemind/dht/search.py

@@ -0,0 +1,62 @@
+import heapq
+from typing import Collection, Callable, Tuple, List, Awaitable, Set
+from warnings import warn
+
+from .routing import DHTID
+
+
+async def traverse_dht(query_id: DHTID, initial_nodes: Collection[DHTID], k_nearest: int, beam_size: int,
+                       get_neighbors: Callable[[DHTID], Awaitable[Tuple[Collection[DHTID], bool]]],
+                       visited_nodes: Collection[DHTID] = ()) -> Tuple[List[DHTID], Set[DHTID]]:
+    """
+    Asynchronous beam search over the DHT. Not meant to be called by the user; please use DHTNode.store/get instead.
+    Traverse the DHT graph using the get_neighbors function and find up to k_nearest nodes according to DHTID.xor_distance.
+    Approximate time complexity: O(T * log T) where T = (path_to_true_nearest + beam_size) * mean_num_neighbors
+
+    :param query_id: search query, find k_nearest neighbors of this DHTID
+    :param initial_nodes: nodes used to pre-populate beam search heap, e.g. [my_own_DHTID, *maybe_some_peers]
+    :param k_nearest: find up to this many nearest neighbors. If there are fewer nodes in the DHT, return all nodes
+    :param beam_size: beam search will not give up until it exhausts this many nearest nodes (to query_id) from the heap.
+        Recommended value: a beam size of k_nearest * (2-5) will yield near-perfect results.
+
+    :param get_neighbors: A function that returns neighbors of a given node and controls beam search stopping criteria.
+        async def get_neighbors(node: DHTID) -> neighbors_of_that_node: List[DHTID], should_continue: bool
+        If should_continue is False, beam search will halt and return k_nearest of whatever it found by then.
+
+    :param visited_nodes: beam search will neither call get_neighbors on these nodes, nor return them as nearest
+    :returns: a list of k nearest nodes (nearest to farthest), and a set of all visited nodes (including visited_nodes)
+    """
+    if beam_size < k_nearest:
+        warn(f"beam search: beam_size({beam_size}) is too small, beam search may fail to find k neighbors.")
+    visited_nodes = set(visited_nodes)  # note: copy visited_nodes because we will add more nodes to this collection.
+    initial_nodes = [node_id for node_id in initial_nodes if node_id not in visited_nodes]
+    if not initial_nodes:
+        return [], visited_nodes
+
+    unvisited_nodes = [(distance, uid) for uid, distance in zip(initial_nodes, query_id.xor_distance(initial_nodes))]
+    heapq.heapify(unvisited_nodes)  # nearest-first heap of candidates, unlimited size
+
+    nearest_nodes = [(-distance, node_id) for distance, node_id in heapq.nsmallest(beam_size, unvisited_nodes)]
+    heapq.heapify(nearest_nodes)  # farthest-first heap of size beam_size, used for early-stopping and to select results
+    while len(nearest_nodes) > beam_size:
+        heapq.heappop(nearest_nodes)
+
+    visited_nodes |= set(initial_nodes)
+    upper_bound = -nearest_nodes[0][0]  # distance to farthest element that is still in beam
+    was_interrupted = False  # will be set to True if the host triggered beam search to stop via get_neighbors
+
+    while (not was_interrupted) and len(unvisited_nodes) != 0 and unvisited_nodes[0][0] <= upper_bound:
+        _, node_id = heapq.heappop(unvisited_nodes)  # note: this  --^ is the smallest element in heap (see heapq)
+        neighbors, was_interrupted = await get_neighbors(node_id)
+        neighbors = [node_id for node_id in neighbors if node_id not in visited_nodes]
+        visited_nodes.update(neighbors)
+
+        for neighbor_id, distance in zip(neighbors, query_id.xor_distance(neighbors)):
+            if distance <= upper_bound or len(nearest_nodes) < beam_size:
+                heapq.heappush(unvisited_nodes, (distance, neighbor_id))
+
+                heapq_add_or_replace = heapq.heappush if len(nearest_nodes) < beam_size else heapq.heappushpop
+                heapq_add_or_replace(nearest_nodes, (-distance, neighbor_id))
+                upper_bound = -nearest_nodes[0][0]  # distance to beam_size-th nearest element found so far
+
+    return [node_id for _, node_id in heapq.nlargest(k_nearest, nearest_nodes)], visited_nodes
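
A self-contained toy example of traverse_dht; the random topology and helper names below are illustrative only and not part of the PR:

    import asyncio
    import random

    from hivemind.dht.routing import DHTID
    from hivemind.dht.search import traverse_dht

    async def toy_search():
        all_nodes = [DHTID.generate() for _ in range(100)]
        topology = {node: random.sample(all_nodes, 10) for node in all_nodes}  # static toy graph

        async def get_neighbors(node: DHTID):
            return topology[node], True  # True = "do not interrupt the search"

        query = DHTID.generate()
        nearest, visited = await traverse_dht(
            query_id=query, initial_nodes=all_nodes[:3], k_nearest=5, beam_size=20,
            get_neighbors=get_neighbors)
        print(f"found {len(nearest)} nearest nodes after visiting {len(visited)} nodes")

    asyncio.run(toy_search())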

+ 4 - 4
hivemind/server/__init__.py

@@ -2,11 +2,11 @@ import multiprocessing as mp
 import os
 import threading
 from socket import socket, AF_INET, SOCK_STREAM, SO_REUSEADDR, SOL_SOCKET, timeout
-from typing import Dict
+from typing import Dict, Optional
 
 from .connection_handler import handle_connection
 from .dht_handler import DHTHandlerThread
-from ..dht import DHTNode
+from ..dht import DHT
 from ..runtime import Runtime, ExpertBackend
 
 
@@ -20,7 +20,7 @@ class Server(threading.Thread):
      - publishes updates to expert status every :update_period: seconds
      - follows orders from HivemindController - if it exists
 
-    :type dht: DHTNode or None. Server with dht=None will NOT be visible from DHT,
+    :type dht: DHT or None. Server with dht=None will NOT be visible from DHT,
      but it will still support accessing experts directly with RemoteExpert(uid=UID, host=IPADDR, port=PORT).
     :param expert_backends: dict{expert uid (str) : ExpertBackend} for all expert hosted by this server.
     :param addr: server's dht address that determines how it can be accessed. Default is local connections only.
@@ -33,7 +33,7 @@ class Server(threading.Thread):
         is ready (see .ready below)
     """
 
-    def __init__(self, dht: DHTNode, expert_backends: Dict[str, ExpertBackend], addr='127.0.0.1',
+    def __init__(self, dht: Optional[DHT], expert_backends: Dict[str, ExpertBackend], addr='127.0.0.1',
                  port: int = 8080, conn_handler_processes: int = 1, update_period: int = 30, start=False,
                  **kwargs):
         super().__init__()

+ 2 - 2
hivemind/server/dht_handler.py

@@ -1,11 +1,11 @@
 import threading
 import time
 
-from ..dht import DHTNode
+from ..dht import DHT
 
 
 class DHTHandlerThread(threading.Thread):
-    def __init__(self, experts, dht: DHTNode,
+    def __init__(self, experts, dht: DHT,
                  update_period: int = 5, addr: str = '127.0.0.1', port: int = 8080):
         super(DHTHandlerThread, self).__init__()
         self.port = port

+ 16 - 10
hivemind/utils/connection.py

@@ -1,7 +1,11 @@
-from contextlib import AbstractContextManager
-from socket import socket
+import socket
+from contextlib import AbstractContextManager, closing
 from typing import Tuple
 
+Hostname, Port = str, int  # flavour types
+Endpoint = Tuple[Hostname, Port]  # https://networkengineering.stackexchange.com/a/9435
+LOCALHOST = '127.0.0.1'
+
 
 class Connection(AbstractContextManager):
     header_size = 4  # number of characters in all headers
@@ -9,12 +13,12 @@ class Connection(AbstractContextManager):
 
     __slots__ = ('conn', 'addr')
 
-    def __init__(self, conn: socket, addr: Tuple[str, int]):
+    def __init__(self, conn: socket, addr: Endpoint):
         self.conn, self.addr = conn, addr
 
     @staticmethod
     def create(host: str, port: int):
-        sock = socket()
+        sock = socket.socket()
         addr = (host, port)
         sock.connect(addr)
         return Connection(sock, addr)
@@ -54,10 +58,12 @@ class Connection(AbstractContextManager):
         self.conn.close()
 
 
-def find_open_port():
+def find_open_port(params=(socket.AF_INET, socket.SOCK_STREAM), opt=(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)):
+    """ Finds a tcp port that can be occupied with a socket with *params and use *opt options """
     try:
-        sock = socket()
-        sock.bind(('', 0))
-        return sock.getsockname()[1]
-    except:
-        raise ValueError("Could not find open port")
+        with closing(socket.socket(*params)) as sock:
+            sock.bind(('', 0))
+            sock.setsockopt(*opt)
+            return sock.getsockname()[1]
+    except Exception as e:
+        raise e
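
A short usage sketch for the helpers above (assuming they are imported directly from this module):

    from hivemind.utils.connection import find_open_port, Endpoint, LOCALHOST

    port = find_open_port()                 # the OS picks a free TCP port; the probe socket is closed immediately
    endpoint: Endpoint = (LOCALHOST, port)  # (hostname, port) pair used as a peer address elsewhere in the DHT code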

+ 1 - 1
requirements.txt

@@ -3,6 +3,6 @@ joblib>=0.13
 numpy>=1.17
 requests>=2.22.0
 tqdm
-kademlia>=2.2
+rpcudp>=4.0.0
 prefetch_generator>=1.0.1
 nose>=1.3.0

+ 104 - 0
tests/benchmark_dht.py

@@ -0,0 +1,104 @@
+import argparse
+import time
+import asyncio
+import multiprocessing as mp
+import random
+
+import hivemind
+from typing import List, Dict
+
+from hivemind import get_dht_time
+from hivemind.dht.node import DHTID, Endpoint, DHTNode, LOCALHOST
+
+
+def run_benchmark_node(node_id, port, peers, ready: mp.Event, request_period,
+                       expiration_time, wait_before_read, time_to_test, statistics: mp.Queue, dht_loaded: mp.Event):
+    if asyncio.get_event_loop().is_running():
+        asyncio.get_event_loop().stop()  # if we're in jupyter, get rid of its built-in event loop
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    node = DHTNode(node_id, port, initial_peers=peers)
+    await_forever = hivemind.run_forever(asyncio.get_event_loop().run_forever)
+    ready.set()
+    dht_loaded.wait()
+    start = time.perf_counter()
+    while time.perf_counter() < start + time_to_test:
+        query_id = DHTID.generate()
+        store_value = random.randint(0, 256)
+
+        store_time = time.perf_counter()
+        success_store = asyncio.run_coroutine_threadsafe(
+            node.store(query_id, store_value, get_dht_time() + expiration_time), loop).result()
+        store_time = time.perf_counter() - store_time
+        if success_store:
+            time.sleep(wait_before_read)
+            get_time = time.perf_counter()
+            get_value, get_time_expiration = asyncio.run_coroutine_threadsafe(node.get(query_id), loop).result()
+            get_time = time.perf_counter() - get_time
+            success_get = (get_value == store_value)
+            statistics.put((success_store, store_time, success_get, get_time))
+        else:
+            statistics.put((success_store, store_time, None, None))
+    await_forever.result()  # process will exit only if event loop broke down
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--num_nodes', type=int, default=20, required=False)
+    parser.add_argument('--request_period', type=float, default=2, required=False)
+    parser.add_argument('--expiration_time', type=float, default=10, required=False)
+    parser.add_argument('--wait_before_read', type=float, default=1, required=False)
+    parser.add_argument('--time_to_test', type=float, default=10, required=False)
+    args = parser.parse_args()
+
+    statistics = mp.Queue()
+    dht: Dict[Endpoint, DHTID] = {}
+    processes: List[mp.Process] = []
+
+    num_nodes = args.num_nodes
+    request_period = args.request_period
+    expiration_time = args.expiration_time
+    wait_before_read = args.wait_before_read
+    time_to_test = args.time_to_test
+
+    dht_loaded = mp.Event()
+    for i in range(num_nodes):
+        node_id = DHTID.generate()
+        port = hivemind.find_open_port()
+        peers = random.sample(list(dht.keys()), min(len(dht), 5))
+        ready = mp.Event()
+        proc = mp.Process(target=run_benchmark_node, args=(node_id, port, peers, ready, request_period,
+                                                           expiration_time, wait_before_read, time_to_test, statistics,
+                                                           dht_loaded), daemon=True)
+        proc.start()
+        ready.wait()
+        processes.append(proc)
+        dht[(LOCALHOST, port)] = node_id
+    dht_loaded.set()
+    time.sleep(time_to_test)
+    success_store = 0
+    all_store = 0
+    time_store = 0
+    success_get = 0
+    all_get = 0
+    time_get = 0
+    while not statistics.empty():
+        success_store_i, store_time_i, success_get_i, get_time_i = statistics.get()
+        all_store += 1
+        time_store += store_time_i
+        if success_store_i:
+            success_store += 1
+            all_get += 1
+            success_get += 1 if success_get_i else 0
+            time_get += get_time_i
+    alive_nodes_count = 0
+    loop = asyncio.new_event_loop()
+    node = DHTNode(loop=loop)
+    for addr, port in dht:
+        if loop.run_until_complete(node.protocol.call_ping((addr, port))) is not None:
+            alive_nodes_count += 1
+    print("store success rate: ", success_store / all_store)
+    print("mean store time: ", time_store / all_store)
+    print("get success rate: ", success_get / all_get)
+    print("mean get time: ", time_get / all_get)
+    print("death rate: ", (num_nodes - alive_nodes_count) / num_nodes)

+ 276 - 0
tests/test_dht.py

@@ -0,0 +1,276 @@
+import time
+import asyncio
+import multiprocessing as mp
+import random
+import heapq
+import uuid
+from functools import partial
+from itertools import chain
+from typing import Optional
+import numpy as np
+
+import hivemind
+from typing import List, Dict
+
+from hivemind import get_dht_time
+from hivemind.dht.node import DHTID, Endpoint, DHTNode, LOCALHOST, KademliaProtocol
+from hivemind.dht.protocol import LocalStorage
+
+
+def run_protocol_listener(port: int, dhtid: DHTID, started: mp.synchronize.Event,
+                          ping: Optional[hivemind.Endpoint] = None):
+    loop = asyncio.new_event_loop()
+    protocol = partial(KademliaProtocol, dhtid, bucket_size=20, depth_modulo=5, wait_timeout=5)
+    listen = loop.create_datagram_endpoint(protocol, local_addr=('127.0.0.1', port))
+    transport, protocol = loop.run_until_complete(listen)
+    print(f"Started peer id={protocol.node_id} port={port}", flush=True)
+
+    if ping is not None:
+        loop.run_until_complete(protocol.call_ping(ping))
+    started.set()
+    loop.run_forever()
+    print(f"Finished peer id={protocol.node_id} port={port}", flush=True)
+
+
+def test_kademlia_protocol():
+    try:
+        # create the first peer
+        peer1_port, peer1_id, peer1_started = hivemind.find_open_port(), DHTID.generate(), mp.Event()
+        peer1_proc = mp.Process(target=run_protocol_listener, args=(peer1_port, peer1_id, peer1_started), daemon=True)
+        peer1_proc.start(), peer1_started.wait()
+
+        # create another peer that connects to the first peer
+        peer2_port, peer2_id, peer2_started = hivemind.find_open_port(), DHTID.generate(), mp.Event()
+        peer2_proc = mp.Process(target=run_protocol_listener, args=(peer2_port, peer2_id, peer2_started),
+                                kwargs={'ping': ('127.0.0.1', peer1_port)}, daemon=True)
+        peer2_proc.start(), peer2_started.wait()
+
+        port = hivemind.find_open_port()
+        loop = asyncio.new_event_loop()
+        protocol = partial(KademliaProtocol, DHTID.generate(), bucket_size=20, depth_modulo=5, wait_timeout=5)
+        listen = loop.create_datagram_endpoint(protocol, local_addr=('127.0.0.1', port))
+        transport, protocol = loop.run_until_complete(listen)
+        print(f"Self id={protocol.node_id} port={port}", flush=True)
+
+        assert loop.run_until_complete(protocol.call_ping(('127.0.0.1', peer1_port))) == peer1_id
+
+        key, value, expiration = DHTID.generate(), [123, {'ololo': 'pyshpysh'}], get_dht_time() + 1e3
+        assert loop.run_until_complete(protocol.call_store(('127.0.0.1', peer1_port), key, value, expiration))
+
+        # peer 1 must know about peer 2
+        nodes_found = loop.run_until_complete(
+            protocol.call_find_node(('127.0.0.1', peer1_port), key))
+        (recv_id, recv_endpoint) = next(iter(nodes_found.items()))
+        assert recv_id == peer2_id and recv_endpoint == ('127.0.0.1', peer2_port), \
+            f"expected id={peer2_id}, port={('127.0.0.1', peer2_port)} but got {recv_id}, {recv_endpoint}"
+
+        # peer 2 must know about peer 1
+        nodes_found_2 = loop.run_until_complete(protocol.call_find_node(('127.0.0.1', peer2_port), key))
+        (recv_id, recv_endpoint) = next(iter(nodes_found_2.items()))
+        assert recv_id == peer1_id and recv_endpoint == ('127.0.0.1', peer1_port), \
+            f"expected id={peer1_id}, port={('127.0.0.1', peer1_port)} but got {recv_id}, {recv_endpoint}"
+
+        recv_value, recv_expiration, recv_peers = loop.run_until_complete(
+            protocol.call_find_value(('127.0.0.1', peer1_port), key))
+        assert recv_value == value and recv_expiration == expiration, "call_find_value expected " \
+                                                                      f"{value} (expires by {expiration}) but got {recv_value} (expires by {recv_expiration})"
+        print(recv_peers, nodes_found)
+        assert recv_peers == nodes_found, "call_find_value must return the same peers as call_find_node"
+        print("Kademlia test finished sucessfully!")
+
+    finally:
+        peer1_proc.terminate()
+        peer2_proc.terminate()
+
+
+def run_node(node_id, port, peers, status_pipe: mp.Pipe):
+    if asyncio.get_event_loop().is_running():
+        asyncio.get_event_loop().stop()  # if we're in jupyter, get rid of its built-in event loop
+    asyncio.set_event_loop(asyncio.new_event_loop())
+    try:
+        node = DHTNode(node_id, port, initial_peers=peers)
+        status_pipe.send('STARTED')
+        while True:
+            asyncio.get_event_loop().run_forever()
+    except BaseException as e:
+        status_pipe.send(e)  # report exception to master
+        if not isinstance(e, OSError):
+            raise e
+
+
+def test_dht():
+    # create dht with 50 nodes + your 51-st node
+    dht: Dict[Endpoint, DHTID] = {}
+    processes: List[mp.Process] = []
+    port_fails, max_port_fails = 0, 10
+
+    while len(dht) < 50:
+        node_id = DHTID.generate()
+        peers = random.sample(list(dht.keys()), min(len(dht), 5))
+        port = hivemind.find_open_port()
+        pipe_recv, pipe_send = mp.Pipe(duplex=False)
+        proc = mp.Process(target=run_node, args=(node_id, port, peers, pipe_send), daemon=True)
+        proc.start()
+
+        status = pipe_recv.recv()
+        if status == 'STARTED':
+            processes.append(proc)
+            dht[(LOCALHOST, port)] = node_id
+        else:
+            assert isinstance(status, BaseException)
+            proc.terminate()
+            if isinstance(status, OSError):  # port already in use. It just happens sometimes.
+                port_fails += 1
+                if port_fails > max_port_fails:
+                    raise OSError("Too many 'Address already in use' errors.")
+            else:
+                raise ValueError(f"Failed to create node due to an error {status}, see traceback above")
+
+    loop = asyncio.get_event_loop()
+    me = hivemind.dht.node.DHTNode(initial_peers=random.sample(peers, 5), port=0)  # port=0 means os-specified port
+
+    # test 1: find self
+    nearest = loop.run_until_complete(me.find_nearest_nodes(query_id=me.node_id, k_nearest=1))
+    assert len(nearest) == 1 and nearest[me.node_id] == (LOCALHOST, me.port)
+
+    # test 2: find others
+    for i in range(10):
+        ref_endpoint, query_id = random.choice(list(dht.items()))
+        nearest = loop.run_until_complete(me.find_nearest_nodes(query_id=query_id, k_nearest=1))
+        assert len(nearest) == 1 and next(iter(nearest.items())) == (query_id, ref_endpoint)
+
+    # test 3: find neighbors to random nodes
+    accuracy_numerator = accuracy_denominator = 0  # top-1 nearest neighbor accuracy
+    jaccard_numerator = jaccard_denominator = 0  # jaccard similarity aka intersection over union
+    all_node_ids = list(dht.values())
+
+    for i in range(100):
+        query_id = DHTID.generate()
+        k_nearest = random.randint(1, 20)
+        exclude_self = random.random() > 0.5
+        nearest = loop.run_until_complete(
+            me.find_nearest_nodes(query_id=query_id, k_nearest=k_nearest, exclude_self=exclude_self))
+        nearest_nodes = list(nearest)  # keys from ordered dict
+
+        assert len(nearest_nodes) == k_nearest, "beam search must return exactly k_nearest results"
+        assert me.node_id not in nearest_nodes or not exclude_self, "if exclude, results should not contain own node id"
+        assert np.all(np.diff(query_id.xor_distance(nearest_nodes)) >= 0), "results must be sorted by distance"
+
+        ref_nearest = heapq.nsmallest(k_nearest + 1, all_node_ids, key=query_id.xor_distance)
+        if exclude_self and me.node_id in ref_nearest:
+            ref_nearest.remove(me.node_id)
+        if len(ref_nearest) > k_nearest:
+            ref_nearest.pop()
+
+        accuracy_numerator += nearest_nodes[0] == ref_nearest[0]
+        accuracy_denominator += 1
+
+        jaccard_numerator += len(set.intersection(set(nearest_nodes), set(ref_nearest)))
+        jaccard_denominator += k_nearest
+
+    accuracy = accuracy_numerator / accuracy_denominator
+    print("Top-1 accuracy:", accuracy)  # should be 98-100%
+    jaccard_index = jaccard_numerator / jaccard_denominator
+    print("Jaccard index (intersection over union):", jaccard_index)  # should be 95-100%
+    assert accuracy >= 0.9, f"Top-1 accuracy only {accuracy} ({accuracy_numerator} / {accuracy_denominator})"
+    assert jaccard_index >= 0.9, f"Jaccard index only {accuracy} ({accuracy_numerator} / {accuracy_denominator})"
+
+    # test 4: find all nodes
+    nearest = loop.run_until_complete(
+        me.find_nearest_nodes(query_id=DHTID.generate(), k_nearest=len(dht) + 100))
+    assert len(nearest) == len(dht) + 1
+    assert len(set.difference(set(nearest.keys()), set(all_node_ids) | {me.node_id})) == 0
+
+    # test 5: node without peers
+    other_node = hivemind.dht.node.DHTNode()
+    nearest = loop.run_until_complete(other_node.find_nearest_nodes(DHTID.generate()))
+    assert len(nearest) == 1 and nearest[other_node.node_id] == (LOCALHOST, other_node.port)
+    nearest = loop.run_until_complete(other_node.find_nearest_nodes(DHTID.generate(), exclude_self=True))
+    assert len(nearest) == 0
+
+    # test 6 store and get value
+    true_time = get_dht_time() + 1200
+    assert loop.run_until_complete(me.store("mykey", ["Value", 10], true_time))
+    val, expiration_time = loop.run_until_complete(me.get("mykey"))
+    assert expiration_time == true_time, "Wrong time"
+    assert val == ["Value", 10], "Wrong value"
+
+    # terminate remaining processes
+    for proc in processes:
+        proc.terminate()
+
+
+def test_hivemind_dht():
+    peers = [hivemind.dht.DHT(start=True)]
+    for i in range(10):
+        neighbors_i = [('localhost', node.port) for node in random.sample(peers, min(3, len(peers)))]
+        peers.append(hivemind.DHT(*neighbors_i, start=True))
+
+    you: hivemind.dht.DHT = random.choice(peers)
+    theguyshetoldyounottoworryabout: hivemind.dht.DHT = random.choice(peers)
+
+    expert_uids = [str(uuid.uuid4()) for _ in range(110)]
+    batch_size = 10
+    for batch_start in range(0, len(expert_uids), batch_size):
+        you.declare_experts(expert_uids[batch_start: batch_start + batch_size], 'localhost', 1234)
+
+    found = theguyshetoldyounottoworryabout.get_experts(random.sample(expert_uids, 5) + ['foo', 'bar'])
+    assert all(res is not None for res in found[:-2]), "Could not find some existing experts"
+    assert all(res is None for res in found[-2:]), "Found non-existing experts"
+
+    that_guys_expert, that_guys_port = str(uuid.uuid4()), random.randint(1000, 9999)
+    theguyshetoldyounottoworryabout.declare_experts([that_guys_expert], 'that_host', that_guys_port)
+    you_notfound, you_found = you.get_experts(['foobar', that_guys_expert])
+    assert isinstance(you_found, hivemind.RemoteExpert)
+    assert you_found.host == 'that_host' and you_found.port == that_guys_port
+
+    # test first_k_active
+    assert theguyshetoldyounottoworryabout.first_k_active(expert_uids, k=10) == expert_uids[:10]
+
+    some_permuted_experts = random.sample(expert_uids, k=32)
+    assert theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=32) == some_permuted_experts
+    assert theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=1) == some_permuted_experts[:1]
+    fake_and_real_experts = list(chain(*zip(
+        [str(uuid.uuid4()) for _ in some_permuted_experts], some_permuted_experts)))
+    assert theguyshetoldyounottoworryabout.first_k_active(fake_and_real_experts, k=9) == some_permuted_experts[:9]
+
+    for peer in peers:
+        peer.shutdown()
+
+
+def test_store():
+    d = LocalStorage()
+    d.store("key", "val", get_dht_time() + 10)
+    assert d.get("key")[0] == "val", "Wrong value"
+    print("Test store passed")
+
+
+def test_get_expired():
+    d = LocalStorage()
+    d.store("key", "val", get_dht_time() + 1)
+    time.sleep(2)
+    assert d.get("key") == (None, None), "Expired value must be deleted"
+    print("Test get expired passed")
+
+
+def test_get_empty():
+    d = LocalStorage()
+    assert d.get("key") == (None, None), "Expired value must be deleted"
+    print("Test get expired passed")
+
+
+def test_change_expiration_time():
+    d = LocalStorage()
+    d.store("key", "val1", get_dht_time() + 2)
+    d.store("key", "val2", get_dht_time() + 200)
+    time.sleep(4)
+    assert d.get("key")[0] == "val2", "Value must be changed, but still kept in table"
+    print("Test change expiration time passed")
+
+
+def test_maxsize_cache():
+    d = LocalStorage(maxsize=1)
+    d.store("key1", "val1", get_dht_time() + 1)
+    d.store("key2", "val2", get_dht_time() + 200)
+    assert d.get("key2")[0] == "val2", "Value with bigger exp. time must be kept"
+    assert d.get("key1")[0] is None, "Value with less exp time, must be deleted"

+ 1 - 1
tests/test_moe.py

@@ -66,7 +66,7 @@ def test_determinism():
 
 def test_compute_expert_scores():
     try:
-        dht = hivemind.DHTNode(port=hivemind.find_open_port(), start=True)
+        dht = hivemind.DHT(port=hivemind.find_open_port(), start=True)
         moe = hivemind.client.moe.RemoteMixtureOfExperts(
             dht=dht, in_features=1024, grid_size=(40,), k_best=4, k_min=1, timeout_after_k_min=1,
             uid_prefix='expert')

+ 114 - 0
tests/test_routing.py

@@ -0,0 +1,114 @@
+import random
+import heapq
+import operator
+from itertools import chain, zip_longest
+
+from hivemind.dht.routing import RoutingTable, DHTID
+from hivemind.utils.serializer import PickleSerializer
+
+
+def test_ids_basic():
+    # basic functionality tests
+    for i in range(100):
+        id1, id2 = DHTID.generate(), DHTID.generate()
+        assert DHTID.MIN <= id1 < DHTID.MAX and DHTID.MIN <= id2 < DHTID.MAX
+        assert DHTID.xor_distance(id1, id1) == DHTID.xor_distance(id2, id2) == 0
+        assert DHTID.xor_distance(id1, id2) > 0 or (id1 == id2)
+        assert len(PickleSerializer.dumps(id1)) - len(PickleSerializer.dumps(int(id1))) < 40
+        assert DHTID.from_bytes(bytes(id1)) == id1 and DHTID.from_bytes(id2.to_bytes()) == id2
+
+
+def test_ids_depth():
+    for i in range(100):
+        ids = [random.randint(0, 4096) for i in range(random.randint(1, 256))]
+        ours = DHTID.longest_common_prefix_length(*map(DHTID, ids))
+
+        ids_bitstr = [
+            "".join(bin(bite)[2:].rjust(8, '0') for bite in uid.to_bytes(20, 'big'))
+            for uid in ids
+        ]
+        reference = len(shared_prefix(*ids_bitstr))
+        assert reference == ours, f"ours {ours} != reference {reference}, ids: {ids}"
+
+
+def test_routing_table_basic():
+    node_id = DHTID.generate()
+    routing_table = RoutingTable(node_id, bucket_size=20, depth_modulo=5)
+
+    for phony_neighbor_port in random.sample(range(10000), 100):
+        phony_id = DHTID.generate()
+        routing_table.add_or_update_node(phony_id, ('localhost', phony_neighbor_port))
+        assert routing_table[phony_id] == ('localhost', phony_neighbor_port)
+
+    assert routing_table.buckets[0].lower == DHTID.MIN and routing_table.buckets[-1].upper == DHTID.MAX
+    for bucket in routing_table.buckets:
+        assert len(bucket.replacement_nodes) == 0, "There should be no replacement nodes in a table with 100 entries"
+    assert 3 <= len(routing_table.buckets) <= 10, len(routing_table.buckets)
+
+
+def test_routing_table_parameters():
+    for (bucket_size, modulo, min_nbuckets, max_nbuckets) in [
+        (20,          5,      45,           65),
+        (50,          5,      35,           45),
+        (20,          10,     650,          800),
+        (20,          1,      7,            15),
+    ]:
+        node_id = DHTID.generate()
+        routing_table = RoutingTable(node_id, bucket_size=bucket_size, depth_modulo=modulo)
+        for phony_neighbor_port in random.sample(range(1_000_000), 10_000):
+            routing_table.add_or_update_node(DHTID.generate(), ('localhost', phony_neighbor_port))
+        for bucket in routing_table.buckets:
+            assert len(bucket.replacement_nodes) == 0 or len(bucket.nodes_to_addr) <= bucket.size
+        assert min_nbuckets <= len(routing_table.buckets) <= max_nbuckets, (
+            f"Unexpected number of buckets: {min_nbuckets} <= {len(routing_table.buckets)} <= {max_nbuckets}")
+
+
+def test_routing_table_search():
+    for table_size, lower_active, upper_active in [
+        (10, 10, 10), (10_000, 800, 1100)
+    ]:
+        node_id = DHTID.generate()
+        routing_table = RoutingTable(node_id, bucket_size=20, depth_modulo=5)
+        num_added = 0
+        for phony_neighbor_port in random.sample(range(1_000_000), table_size):
+            num_added += routing_table.add_or_update_node(DHTID.generate(), ('localhost', phony_neighbor_port)) is None
+        num_replacements = sum(len(bucket.replacement_nodes) for bucket in routing_table.buckets)
+    
+        all_active_neighbors = list(chain(
+            *(bucket.nodes_to_addr.keys() for bucket in routing_table.buckets)
+        ))
+        assert lower_active <= len(all_active_neighbors) <= upper_active
+        assert len(all_active_neighbors) == num_added
+        assert num_added + num_replacements == table_size
+    
+        # random queries
+        for i in range(500):
+            k = random.randint(1, 100)
+            query_id = DHTID.generate()
+            exclude = query_id if random.random() < 0.5 else None
+            our_knn, our_addrs = zip(*routing_table.get_nearest_neighbors(query_id, k=k, exclude=exclude))
+            reference_knn = heapq.nsmallest(k, all_active_neighbors, key=query_id.xor_distance)
+            assert all(our == ref for our, ref in zip_longest(our_knn, reference_knn))
+            assert all(our_addr == routing_table[our_node] for our_node, our_addr in zip(our_knn, our_addrs))
+    
+        # queries from table
+        for i in range(500):
+            k = random.randint(1, 100)
+            query_id = random.choice(all_active_neighbors)
+            our_knn, our_addrs = zip(*routing_table.get_nearest_neighbors(query_id, k=k, exclude=query_id))
+            reference_knn = heapq.nsmallest(
+                k + 1, all_active_neighbors,
+                key=lambda uid: query_id.xor_distance(uid))
+            if query_id in reference_knn:
+                reference_knn.remove(query_id)
+            assert len(our_knn) == len(reference_knn)
+            assert all(query_id.xor_distance(our) == query_id.xor_distance(ref)
+                       for our, ref in zip_longest(our_knn, reference_knn))
+            assert routing_table.get_nearest_neighbors(query_id, k=k, exclude=None)[0][0] == query_id
+
+
+def shared_prefix(*strings: str):
+    for i in range(min(map(len, strings))):
+        if len(set(map(operator.itemgetter(i), strings))) != 1:
+            return strings[0][:i]
+    return min(strings, key=len)

+ 3 - 3
tests/test_utils/run_server.py

@@ -11,7 +11,7 @@ from .layers import name_to_block, name_to_input
 def make_dummy_server(interface='0.0.0.0', port=None, num_experts=1, expert_cls='ffn', hidden_dim=1024,
                       num_handlers=None, expert_prefix='expert', expert_offset=0, max_batch_size=16384, device=None,
                       no_optimizer=False, no_dht=False, initial_peers=(), dht_port=None, root_port=None, verbose=True,
-                      UID_DELIMETER=hivemind.DHTNode.UID_DELIMETER, start=False, **kwargs) -> hivemind.Server:
+                      UID_DELIMETER=hivemind.DHT.UID_DELIMETER, start=False, **kwargs) -> hivemind.Server:
     """
     Instantiate a server with several identical experts. See argparse comments below for details
     :param interface: 'localhost' for local connections only, '0.0.0.0' for ipv4 '::' for ipv6
@@ -45,7 +45,7 @@ def make_dummy_server(interface='0.0.0.0', port=None, num_experts=1, expert_cls=
     if not no_dht:
         if not len(initial_peers):
             print("No initial peers provided. Starting additional dht as an initial peer.")
-            dht_root = hivemind.DHTNode(
+            dht_root = hivemind.DHT(
                 *initial_peers, port=root_port or hivemind.find_open_port(), start=True)
             print(f"Initializing DHT with port {dht_root.port}")
             initial_peers = (('localhost', dht_root.port),)
@@ -54,7 +54,7 @@ def make_dummy_server(interface='0.0.0.0', port=None, num_experts=1, expert_cls=
             if root_port is not None:
                 print(f"Warning: root_port={root_port} will not be used since we already have peers.")
 
-        dht = hivemind.DHTNode(
+        dht = hivemind.DHT(
             *initial_peers, port=dht_port or hivemind.find_open_port(), start=True)
         if verbose:
             print(f"Running dht node on port {dht.port}")