
Faster beam search: part1/2 (#97)

* [WIP] hivemind.dht.DHT.get_best_experts

* bump

* unused variable

* implement early stopping event for get_many

* implement early stopping event for get_many

* cancel wait_for_termination

* add __contains__ check to LocalStorage

* doc: add single call guarantee

* LocalStorage: add strict typing

* rollback

* add LocalStorage.top

* rollback

* add strict types and builtins

* LocalStorage: add builtins and freeze

* remove_outdated is now private. Rationale: it doesn't always remove outdated elements, better not cause confusion

* caching rev.2 - now it works ;)

* sphinx formatting fix

* add test for caching policy

* pep8

* use separate tester process

* edge case: frozen LocalStorage

* move LocalStorage tests to a separate file

* test for LocalStorage.top

* add tests for new localstorage functionality

* separate tests for DHTNode and experts-related stuff

* separate tests for DHTNode and experts-related stuff

* typo

* add option to reuse pending get requests

* == None -> is None

* WIP that breaks tests

* WIP that breaks tests

* split get_many_by_id into sub-functions, add tests

* remove debugprint

* circleci u mad?

* circleci u mad?

* restore caching

* rm debugprint

* better expiration_time_threshold in DHTNode reuse

* TEMPORARY: fix broken circleci cache

* recache?

* WIP: override cache?

* WIP: override cache?

* WIP: override cache?

* WIP: override cache?

* pep

* add cmd

* rollback auto refactor

* dev1

* bugfix: do not finish_search if there are concurrent workers

* bugfix: do not finish_search if there are concurrent workers

* bugfix: do not finish_search if there are concurrent workers

* remove find_best_experts for now

* we can probably still finish search for a query if it has no concurrent workers

* update benchmark_dht

* rollback changes

* rollback changes

* rollback changes

* unused import

* auto refactor typo

* typo

* update benchmarks

* fix broken sphinx url

* misc renames

* misc renames

* Update docs/user/contributing.md

Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>

* Update docs/user/contributing.md

Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>

* address review by mryab

* address review by mryab

* address review by mryab

Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>
justheuristic committed 5 years ago
parent commit 9c1e14aca1

+ 1 - 0
.circleci/config.yml

@@ -9,6 +9,7 @@ jobs:
     steps:
       - checkout
       - python/load-cache
+      - run: pip uninstall -y pytest codecov  # temporary override for broken cache
       - run: pip install codecov pytest tqdm scikit-learn
       - python/install-deps
       - python/save-cache

+ 12 - 11
docs/user/contributing.md

@@ -1,4 +1,4 @@
-## Contributing
+## Developer zone
 
 #### Collaborating best practices:
 Hivemind is still in the early stage of development, we expect only a handful of collaborators with individual roles.
@@ -19,7 +19,7 @@ Hivemind is still in the early stage of development, we expect only a handful of
    * If you face any challenges or want feedback, please submit a [draft](https://github.blog/2019-02-14-introducing-draft-pull-requests/) pull request.
 
 
-#### Contributor's manual
+#### Developer quickstart
 
 First, install hivemind in the development mode, preferably with python 3.8 on linux/mac_OS.
 ```
@@ -98,23 +98,24 @@ to measure performance impact of changes to hivemind.dht. It spawns a DHT with `
 then chooses one peer that will declare `num_experts` total experts in batches of `expert_batch_size`.
 Then, another peer will consecutively get all peers and check if they are there.
 
-Here's a run with 1024 participants on the same machine that was used benchmark_throughput:
+Here's a run with 1024 participants on the same machine that was used for benchmark_throughput:
 
+`python benchmark_dht.py --num_peers 1024 --num_experts 16384 --expert_batch_size 64 --expiration 99999 --increase_file_limit`
 <details style="margin-top:-24px; margin-bottom: 16px;">
   <summary>Console outputs</summary>
   
   ```sh
 Increasing file limit - soft 1024=>32768, hard 1048576=>32768
 Creating peers...
-100%|███████████████████████████████████████████████████| 1024/1024 [01:51<00:00,  9.22it/s]
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [01:45<00:00,  9.74it/s]
 Sampled 16384 unique ids (after deduplication)
 Storing peers to dht in batches of 64...
-100%|█████████████████████████████████████████████████████| 256/256 [13:00<00:00,  3.05s/it]
-Store success rate: 100.0% (48904 / 48904)
-Mean store time: 0.015967, Total: 780.85
-100%|█████████████████████████████████████████████████████| 256/256 [02:01<00:00,  2.11it/s]
-Get success rate: 100.0 (16383 / 16384)
-Mean get time: 0.00740, Total: 121.29011
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [12:07<00:00,  2.84s/it]
+Store success rate: 100.0% (48920 / 48920)
+Mean store time: 0.01487, Total: 727.46
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [01:48<00:00,  2.35it/s]
+Get success rate: 100.0 (16384 / 16384)
+Mean get time: 0.00664, Total: 108.73952
 Node survival rate: 100.000%
   ```
 </details>
@@ -125,6 +126,6 @@ If one wants to account for these factors, one must introduce them manually by c
   
 
 #### Tips & tricks
-* You can find a wealth of pytorch debugging tricks at [their contributing page](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md).
+* You can find a wealth of pytorch debugging tricks at [their contributing page](https://tinyurl.com/pytorch-contributing).
 * Hivemind is optimized for development in pycharm CE 2019.3 or newer.
   * When working on tests, please mark "tests" as sources root.

+ 1 - 1
hivemind/__init__.py

@@ -3,4 +3,4 @@ from hivemind.dht import *
 from hivemind.server import *
 from hivemind.utils import *
 
-__version__ = '0.8.2'
+__version__ = '0.8.3'

+ 4 - 2
hivemind/dht/__init__.py

@@ -25,7 +25,9 @@ import uvloop
 from hivemind.client import RemoteExpert
 from hivemind.dht.node import DHTNode, DHTID, DHTExpiration
 from hivemind.dht.routing import get_dht_time
-from hivemind.utils import MPFuture, Endpoint
+from hivemind.utils import MPFuture, Endpoint, get_logger
+
+logger = get_logger(__name__)
 
 
 class DHT(mp.Process):
@@ -155,7 +157,7 @@ class DHT(mp.Process):
         :param uids: a list of expert ids to update
         :param endpoint: endpoint that serves these experts, usually your server endpoint (e.g. "201.111.222.333:1337")
         :param wait: if True, awaits for declaration to finish, otherwise runs in background
-        :param timeout: waits for the procedure to finish, None means wait indeninitely
+        :param timeout: waits for the procedure to finish for up to this long, None means wait indefinitely
         :returns: if wait, returns a list of booleans, (True = store succeeded, False = store rejected)
         """
         assert not isinstance(uids, str), "Please send a list / tuple of expert uids."
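
Below is a condensed sketch of the declare_experts / get_experts round-trip documented above, modeled on the new tests/test_dht_experts.py in this PR (the endpoint and expert uids are arbitrary):

```python
import hivemind

# start a standalone DHT process and register a few experts on it
node = hivemind.DHT(start=True)
assert all(node.declare_experts(['e1', 'e2', 'e3'], f"{hivemind.LOCALHOST}:1337"))

# get_experts returns a RemoteExpert handle per found uid and None for missing uids
expert_e3, missing = node.get_experts(['e3', 'nonexistent_expert'])
assert isinstance(expert_e3, hivemind.RemoteExpert)
assert expert_e3.endpoint == f"{hivemind.LOCALHOST}:1337"
assert missing is None

node.shutdown()
```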

+ 239 - 77
hivemind/dht/node.py

@@ -1,15 +1,21 @@
 from __future__ import annotations
 
 import asyncio
+
 import random
-from collections import namedtuple
-from typing import Optional, Tuple, List, Dict, Collection, Union, Set
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Optional, Tuple, List, Dict, DefaultDict, Collection, Union, Set, Awaitable, Callable, Any, Iterable
+from sortedcontainers import SortedList
+from functools import partial
 from warnings import warn
 
-from hivemind.dht.protocol import DHTProtocol
-from hivemind.dht.routing import DHTID, DHTExpiration, DHTKey, get_dht_time, DHTValue
+from hivemind.dht.protocol import DHTProtocol, LocalStorage
+from hivemind.dht.routing import DHTID, DHTExpiration, DHTKey, get_dht_time, DHTValue, BinaryDHTValue
 from hivemind.dht.traverse import traverse_dht
-from hivemind.utils import Endpoint, LOCALHOST, MSGPackSerializer
+from hivemind.utils import Endpoint, LOCALHOST, MSGPackSerializer, get_logger, SerializerBase
+
+logger = get_logger(__name__)
 
 
 class DHTNode:
@@ -45,8 +51,10 @@ class DHTNode:
 
     """
     # fmt:off
-    node_id: DHTID; port: int; num_replicas: int; cache_locally: bool; cache_nearest: int; num_workers: int
-    refresh_timeout: float; protocol: DHTProtocol
+    node_id: DHTID; is_alive: bool; port: int; num_replicas: int; num_workers: int; protocol: DHTProtocol
+    refresh_timeout: float; cache_locally: bool; cache_nearest: int; cache_refresh_before_expiry: float
+    cache_refresh_available: asyncio.Event; cache_refresh_queue: LocalStorage
+    reuse_get_requests: bool; pending_get_requests: DefaultDict[DHTID, SortedList[_IntermediateResult]]
     serializer = MSGPackSerializer  # used to pack/unpack DHT Values for transfer over network
     # fmt:on
 
@@ -55,8 +63,9 @@ class DHTNode:
             cls, node_id: Optional[DHTID] = None, initial_peers: List[Endpoint] = (),
             bucket_size: int = 20, num_replicas: int = 5, depth_modulo: int = 5, parallel_rpc: int = None,
             wait_timeout: float = 5, refresh_timeout: Optional[float] = None, bootstrap_timeout: Optional[float] = None,
-            num_workers: int = 1, cache_locally: bool = True, cache_nearest: int = 1, cache_size=None,
-            listen: bool = True, listen_on: Endpoint = "0.0.0.0:*", **kwargs) -> DHTNode:
+            cache_locally: bool = True, cache_nearest: int = 1, cache_size=None, cache_refresh_before_expiry: float = 5,
+            reuse_get_requests: bool = True, num_workers: int = 1, listen: bool = True,
+            listen_on: Endpoint = "0.0.0.0:*", **kwargs) -> DHTNode:
         """
         :param node_id: current node's identifier, determines which keys it will store locally, defaults to random id
         :param initial_peers: connects to these peers to populate routing table, defaults to no peers
@@ -71,11 +80,15 @@ class DHTNode:
         :param refresh_timeout: refresh buckets if no node from that bucket was updated in this many seconds
           if staleness_timeout is None, DHTNode will not refresh stale buckets (which is usually okay)
         :param bootstrap_timeout: after one of peers responds, await other peers for at most this many seconds
-        :param num_workers: concurrent workers in traverse_dht (see traverse_dht num_workers param)
         :param cache_locally: if True, caches all values (stored or found) in a node-local cache
         :param cache_nearest: whenever DHTNode finds a value, it will also store (cache) this value on this many
           nodes nearest nodes visited by search algorithm. Prefers nodes that are nearest to :key: but have no value yet
         :param cache_size: if specified, local cache will store up to this many records (as in LRU cache)
+        :param cache_refresh_before_expiry: if nonzero, refreshes locally cached values
+          if they are accessed this many seconds before expiration time.
+        :param reuse_get_requests: if True, DHTNode allows only one traverse_dht procedure for every key;
+          all concurrent get requests for the same key will reuse the procedure that is currently in progress
+        :param num_workers: concurrent workers in traverse_dht (see traverse_dht num_workers param)
         :param listen: if True (default), this node will accept incoming request and otherwise be a DHT "citzen"
           if False, this node will refuse any incoming request, effectively being only a "client"
         :param listen_on: network interface, e.g. "0.0.0.0:1337" or "localhost:*" (* means pick any port) or "[::]:7654"
@@ -83,11 +96,26 @@ class DHTNode:
           see https://grpc.github.io/grpc/core/group__grpc__arg__keys.html for a list of all options
         :param kwargs: extra parameters used in grpc.aio.server
         """
+        if cache_refresh_before_expiry > 0 and not cache_locally:
+            logger.warning("If cache_locally is False, cache_refresh_before_expiry has no effect. To silence this"
+                           " warning, please specify cache_refresh_before_expiry=0")
+
         self = cls(_initialized_with_create=True)
         self.node_id = node_id = node_id if node_id is not None else DHTID.generate()
         self.num_replicas, self.num_workers = num_replicas, num_workers
-        self.cache_locally, self.cache_nearest = cache_locally, cache_nearest
+        self.is_alive = True  # if set to False, cancels all background jobs such as routing table refresh
+
+        self.reuse_get_requests = reuse_get_requests
+        self.pending_get_requests = defaultdict(partial(SortedList, key=lambda _res: - _res.sufficient_expiration_time))
+
+        # caching policy
         self.refresh_timeout = refresh_timeout
+        self.cache_locally, self.cache_nearest = cache_locally, cache_nearest
+        self.cache_refresh_before_expiry = cache_refresh_before_expiry
+        self.cache_refresh_queue = LocalStorage()
+        self.cache_refresh_available = asyncio.Event()
+        if cache_refresh_before_expiry:
+            asyncio.create_task(self._refresh_stale_cache_entries())
 
         self.protocol = await DHTProtocol.create(self.node_id, bucket_size, depth_modulo, num_replicas, wait_timeout,
                                                  parallel_rpc, cache_size, listen, listen_on, **kwargs)
@@ -129,7 +157,9 @@ class DHTNode:
 
     async def shutdown(self, timeout=None):
         """ Process existing requests, close all connections and stop the server """
-        await self.protocol.shutdown(timeout)
+        self.is_alive = False
+        if self.protocol.server:
+            await self.protocol.shutdown(timeout)
 
     async def find_nearest_nodes(
             self, queries: Collection[DHTID], k_nearest: Optional[int] = None, beam_size: Optional[int] = None,
@@ -157,15 +187,15 @@ class DHTNode:
                 node_to_endpoint.update(
                     self.protocol.routing_table.get_nearest_neighbors(query, beam_size, exclude=self.node_id))
 
-        async def get_neighbors(peer: DHTID, queries: Collection[DHTID]) -> Dict[DHTID, Tuple[List[DHTID], bool]]:
+        async def get_neighbors(peer: DHTID, queries: Collection[DHTID]) -> Dict[DHTID, Tuple[Tuple[DHTID], bool]]:
             response = await self.protocol.call_find(node_to_endpoint[peer], queries)
             if not response:
                 return {query: ([], False) for query in queries}
 
-            output: Dict[DHTID, Tuple[List[DHTID], bool]] = {}
+            output: Dict[DHTID, Tuple[Tuple[DHTID], bool]] = {}
             for query, (_, _, peers) in response.items():
                 node_to_endpoint.update(peers)
-                output[query] = list(peers.keys()), False  # False means "do not interrupt search"
+                output[query] = tuple(peers.keys()), False  # False means "do not interrupt search"
             return output
 
         nearest_nodes_per_query, visited_nodes = await traverse_dht(
@@ -289,7 +319,7 @@ class DHTNode:
         Search for a key across DHT and return either first or latest entry.
         :param key: same key as in node.store(...)
         :param latest: if True, finds the latest value, otherwise finds any non-expired value (which is much faster)
-        :param kwargs: parameters forwarded to get_many
+        :param kwargs: parameters forwarded to get_many_by_id
         :returns: (value, expiration time); if value was not found, returns (None, None)
         """
         if latest:
@@ -297,100 +327,190 @@ class DHTNode:
         result = await self.get_many([key])
         return result[key]
 
-    async def get_many(
-            self, keys: Collection[DHTKey], sufficient_expiration_time: Optional[DHTExpiration] = None,
-            num_workers: Optional[int] = None, beam_size: Optional[int] = None
-    ) -> Dict[DHTKey, Tuple[Optional[DHTValue], Optional[DHTExpiration]]]:
+    async def get_many(self, keys: Collection[DHTKey], sufficient_expiration_time: Optional[DHTExpiration] = None,
+                       **kwargs) -> Dict[DHTKey, Union[Tuple[Optional[DHTValue], Optional[DHTExpiration]],
+                                                       Awaitable[Tuple[Optional[DHTValue], Optional[DHTExpiration]]]]]:
         """
+        Traverse DHT to find a list of keys. For each key, return latest (value, expiration) or None if not found.
+
         :param keys: traverse the DHT and find the value for each of these keys (or (None, None) if not key found)
+        :param sufficient_expiration_time: if the search finds a value that expires after this time,
+            default = time of call, find any value that did not expire by the time of call
+            If sufficient_expiration_time=float('inf'), this method will find a value with the _latest_ expiration
+        :param kwargs: for full list of parameters, see DHTNode.get_many_by_id
+        :returns: for each key: value and its expiration time. If nothing is found, returns (None, None) for that key
+        :note: in order to check if get returned a value, please check if (expiration_time is None)
+        """
+        keys = tuple(keys)
+        key_ids = [DHTID.generate(key) for key in keys]
+        id_to_original_key = dict(zip(key_ids, keys))
+        results_by_id = await self.get_many_by_id(key_ids, sufficient_expiration_time, **kwargs)
+        return {id_to_original_key[key]: result_or_future for key, result_or_future in results_by_id.items()}
+
+    async def get_many_by_id(
+            self, key_ids: Collection[DHTID], sufficient_expiration_time: Optional[DHTExpiration] = None,
+            num_workers: Optional[int] = None, beam_size: Optional[int] = None, return_futures: bool = False,
+            _refresh_cache=True) -> Dict[DHTID, Union[Tuple[Optional[DHTValue], Optional[DHTExpiration]],
+                                                      Awaitable[Tuple[Optional[DHTValue], Optional[DHTExpiration]]]]]:
+        """
+        Traverse DHT to find a list of DHTIDs. For each key, return latest (value, expiration) or None if not found.
+
+        :param key_ids: traverse the DHT and find the value for each of these keys (or (None, None) if not key found)
         :param sufficient_expiration_time: if the search finds a value that expires after this time,
             default = time of call, find any value that did not expire by the time of call
             If min_expiration_time=float('inf'), this method will find a value with _latest_ expiration
         :param beam_size: maintains up to this many nearest nodes when crawling dht, default beam_size = bucket_size
         :param num_workers: override for default num_workers, see traverse_dht num_workers param
-        :returns: for each key: value and its expiration time. If nothing is found , returns (None, None) for that key
+        :param return_futures: if True, immediately return asyncio.Future for every key before interacting with the network.
+         The algorithm will populate these futures with (value, expiration) when it finds the corresponding key
+         Note: canceling a future will stop search for the corresponding key
+        :param _refresh_cache: internal flag, whether or not to call self._trigger_cache_refresh
+        :returns: for each key: value and its expiration time. If nothing is found, returns (None, None) for that key
         :note: in order to check if get returned a value, please check (expiration_time is None)
         """
-        key_ids = [DHTID.generate(key) for key in keys]
-        id_to_original_key = dict(zip(key_ids, keys))
         sufficient_expiration_time = sufficient_expiration_time or get_dht_time()
         beam_size = beam_size if beam_size is not None else self.protocol.bucket_size
         num_workers = num_workers if num_workers is not None else self.num_workers
+        search_results: Dict[DHTID, _IntermediateResult] = {key_id: _IntermediateResult(
+            key_id, sufficient_expiration_time, serializer=self.serializer) for key_id in key_ids}
 
-        # search metadata
-        unfinished_key_ids = set(key_ids)  # track key ids for which the search is not terminated
-        node_to_endpoint: Dict[DHTID, Endpoint] = dict()  # global routing table for all queries
+        if _refresh_cache:
+            for key_id in key_ids:
+                search_results[key_id].add_done_callback(self._trigger_cache_refresh)
 
-        SearchResult = namedtuple("SearchResult", ["binary_value", "expiration_time", "source_node_id"])
-        latest_results = {key_id: SearchResult(b'', -float('inf'), None) for key_id in key_ids}
+        # if we have concurrent get request for some of the same keys, subscribe to their results
+        if self.reuse_get_requests:
+            for key_id, search_result in search_results.items():
+                self.pending_get_requests[key_id].add(search_result)
+                search_result.add_done_callback(self._reuse_finished_search_result)
 
-        # stage 1: value can be stored in our local cache
+        # stage 1: check for value in this node's local storage and cache
         for key_id in key_ids:
-            maybe_value, maybe_expiration_time = self.protocol.storage.get(key_id)
-            if maybe_expiration_time is None:
-                maybe_value, maybe_expiration_time = self.protocol.cache.get(key_id)
-            if maybe_expiration_time is not None and maybe_expiration_time > latest_results[key_id].expiration_time:
-                latest_results[key_id] = SearchResult(maybe_value, maybe_expiration_time, self.node_id)
-                if maybe_expiration_time >= sufficient_expiration_time:
-                    unfinished_key_ids.remove(key_id)
-
-        # stage 2: traverse the DHT for any unfinished keys
+            search_results[key_id].add_candidate(*self.protocol.storage.get(key_id), source_node_id=self.node_id)
+            search_results[key_id].add_candidate(*self.protocol.cache.get(key_id), source_node_id=self.node_id)
+
+        # stage 2: traverse the DHT to get the remaining keys from remote peers
+        unfinished_key_ids = [key_id for key_id in key_ids if not search_results[key_id].finished]
+        node_to_endpoint: Dict[DHTID, Endpoint] = dict()  # global routing table for all keys
         for key_id in unfinished_key_ids:
             node_to_endpoint.update(self.protocol.routing_table.get_nearest_neighbors(
                 key_id, self.protocol.bucket_size, exclude=self.node_id))
 
-        async def get_neighbors(peer: DHTID, queries: Collection[DHTID]) -> Dict[DHTID, Tuple[List[DHTID], bool]]:
+        # V-- this function will be called every time traverse_dht decides to request neighbors from a remote peer
+        async def get_neighbors(peer: DHTID, queries: Collection[DHTID]) -> Dict[DHTID, Tuple[Tuple[DHTID], bool]]:
             queries = list(queries)
             response = await self.protocol.call_find(node_to_endpoint[peer], queries)
             if not response:
                 return {query: ([], False) for query in queries}
 
-            output: Dict[DHTID, Tuple[List[DHTID], bool]] = {}
-            for key_id, (maybe_value, maybe_expiration_time, peers) in response.items():
+            output: Dict[DHTID, Tuple[Tuple[DHTID], bool]] = {}
+            for key_id, (maybe_value_bytes, maybe_expiration_time, peers) in response.items():
                 node_to_endpoint.update(peers)
-                if maybe_expiration_time is not None and maybe_expiration_time > latest_results[key_id].expiration_time:
-                    latest_results[key_id] = SearchResult(maybe_value, maybe_expiration_time, peer)
-                should_interrupt = (latest_results[key_id].expiration_time >= sufficient_expiration_time)
-                output[key_id] = list(peers.keys()), should_interrupt
+                search_results[key_id].add_candidate(maybe_value_bytes, maybe_expiration_time, source_node_id=peer)
+                output[key_id] = tuple(peers.keys()), search_results[key_id].finished
+                # note: we interrupt the search if the key is either found or otherwise finished (e.g. cancelled by user)
             return output
 
-        nearest_nodes_per_query, visited_nodes = await traverse_dht(
+        # V-- this function will be called exactly once when traverse_dht finishes search for a given key
+        async def found_callback(key_id: DHTID, nearest_nodes: List[DHTID], _visited: Set[DHTID]):
+            search_results[key_id].finish_search()  # finish search whether or not we found something
+            self._cache_new_result(search_results[key_id], nearest_nodes, node_to_endpoint)
+
+        asyncio.create_task(traverse_dht(
             queries=list(unfinished_key_ids), initial_nodes=list(node_to_endpoint),
             beam_size=beam_size, num_workers=num_workers, queries_per_call=int(len(unfinished_key_ids) ** 0.5),
-            get_neighbors=get_neighbors, visited_nodes={key_id: {self.node_id} for key_id in unfinished_key_ids})
-
-        # stage 3: cache any new results depending on caching parameters
-        for key_id, nearest_nodes in nearest_nodes_per_query.items():
-            latest_value_bytes, latest_expiration_time, latest_node_id = latest_results[key_id]
-            should_cache = latest_expiration_time >= sufficient_expiration_time  # if we found a newer value, cache it
-            if should_cache and self.cache_locally:
-                self.protocol.cache.store(key_id, latest_value_bytes, latest_expiration_time)
-
-            if should_cache and self.cache_nearest:
-                num_cached_nodes = 0
-                for node_id in nearest_nodes:
-                    if node_id == latest_node_id:
-                        continue
-                    asyncio.create_task(self.protocol.call_store(
-                        node_to_endpoint[node_id], [key_id], [latest_value_bytes], [latest_expiration_time],
-                        in_cache=True))
-                    num_cached_nodes += 1
-                    if num_cached_nodes >= self.cache_nearest:
-                        break
-
-        # stage 4: deserialize data and assemble function output
-        find_result: Dict[DHTKey, Tuple[Optional[DHTValue], Optional[DHTExpiration]]] = {}
-        for key_id, (latest_value_bytes, latest_expiration_time, _) in latest_results.items():
-            if latest_expiration_time != -float('inf'):
-                latest_value = self.serializer.loads(latest_value_bytes)
-                find_result[id_to_original_key[key_id]] = (latest_value, latest_expiration_time)
-            else:
-                find_result[id_to_original_key[key_id]] = None, None
-        return find_result
+            get_neighbors=get_neighbors, visited_nodes={key_id: {self.node_id} for key_id in unfinished_key_ids},
+            found_callback=found_callback, await_all_tasks=False))
+
+        if return_futures:
+            return {key_id: search_result.future for key_id, search_result in search_results.items()}
+        else:
+            try:
+                # note: this should be the first time we await something, so there's no need to "try" the entire function
+                return {key_id: await search_result.future for key_id, search_result in search_results.items()}
+            except asyncio.CancelledError as e:  # terminate remaining tasks ASAP
+                for key_id, search_result in search_results.items():
+                    search_result.future.cancel()
+                raise e
+
+    def _reuse_finished_search_result(self, finished: _IntermediateResult):
+        expiration_time_threshold = max(finished.expiration_time or -float('inf'), finished.sufficient_expiration_time)
+        concurrent_requests: SortedList[_IntermediateResult] = self.pending_get_requests[finished.key_id]
+        # note: concurrent_requests is sorted in the order of descending sufficient_expiration_time
+        while concurrent_requests and expiration_time_threshold >= concurrent_requests[-1].sufficient_expiration_time:
+            concurrent_requests[-1].add_candidate(finished.binary_value, finished.expiration_time,
+                                                  source_node_id=finished.source_node_id)
+            concurrent_requests[-1].finish_search()
+            concurrent_requests.pop(-1)
+
+    def _trigger_cache_refresh(self, result: _IntermediateResult):
+        """ Called after get request is finished (whether it was found, not found, hit cache, cancelled, or reused) """
+        if result.found_something and result.source_node_id == self.node_id:
+            with self.protocol.cache.freeze():  # do not clear outdated cache for now...
+                if self.cache_refresh_before_expiry and result.key_id in self.protocol.cache:
+                    previous_earliest_item: Tuple[DHTID, BinaryDHTValue, DHTExpiration] = self.cache_refresh_queue.top()
+                    self.cache_refresh_queue.store(result.key_id, result.binary_value, result.expiration_time)
+                    if previous_earliest_item is None or result.expiration_time < previous_earliest_item[-1]:
+                        self.cache_refresh_available.set()  # if the new element is now earliest, notify the cache queue
+
+    async def _refresh_stale_cache_entries(self):
+        """ periodically refresh near-expired keys that were accessed at least once during their previous lifetime """
+        while self.is_alive:
+            with self.cache_refresh_queue.freeze():
+                while len(self.cache_refresh_queue) == 0:
+                    await self.cache_refresh_available.wait()
+                    self.cache_refresh_available.clear()
+                key_id, _, nearest_expiration = self.cache_refresh_queue.top()
+
+            try:
+                # step 1: await until :cache_refresh_before_expiry: seconds before earliest first element expires
+                time_to_wait = nearest_expiration - get_dht_time() - self.cache_refresh_before_expiry
+                await asyncio.wait_for(self.cache_refresh_available.wait(), timeout=time_to_wait)
+                # note: the line above will cause TimeoutError when we are ready to refresh cache
+                self.cache_refresh_available.clear()  # no timeout error => someone added new entry to queue and ...
+                continue  # ... and this element is earlier than nearest_expiration. we should refresh this entry first
+
+            except asyncio.TimeoutError:  # caught TimeoutError => it is time to refresh the most recent cached entry
+                # step 2: find all keys that we should already refresh and remove them from queue
+                with self.cache_refresh_queue.freeze():
+                    keys_to_refresh = {key_id}
+                    del self.cache_refresh_queue[key_id]  # we pledge to refresh this key_id in the nearest batch
+                    while self.cache_refresh_queue:
+                        key_id, _, nearest_expiration = self.cache_refresh_queue.top()
+                        if nearest_expiration > get_dht_time() + self.cache_refresh_before_expiry:
+                            break
+                        del self.cache_refresh_queue[key_id]  # we pledge to refresh this key_id in the nearest batch
+                        keys_to_refresh.add(key_id)
+
+                # step 3: search newer versions of these keys, cache them as a side-effect of self.get_many_by_id
+                await self.get_many_by_id(
+                    keys_to_refresh, sufficient_expiration_time=nearest_expiration + self.cache_refresh_before_expiry,
+                    _refresh_cache=False)  # if we found value locally, we shouldn't trigger another refresh
+
+    def _cache_new_result(self, result: _IntermediateResult, nearest_nodes: List[DHTID],
+                          node_to_endpoint: Dict[DHTID, Endpoint]):
+        """ after key_id is found, update cache according to caching policy. used internally in get and get_many """
+        if result.found_something:
+            previous_expiration_time = max(self.protocol.storage.get(result.key_id)[1] or -float('inf'),
+                                           self.protocol.cache.get(result.key_id)[1] or -float('inf'))
+            if result.expiration_time > previous_expiration_time:  # if this value has better expiration
+                if self.cache_locally:
+                    self.protocol.cache.store(result.key_id, result.binary_value, result.expiration_time)
+                if self.cache_nearest:
+                    num_cached_nodes = 0
+                    for node_id in nearest_nodes:
+                        if node_id == result.source_node_id:
+                            continue
+                        asyncio.create_task(self.protocol.call_store(
+                            node_to_endpoint[node_id], [result.key_id], [result.binary_value], [result.expiration_time],
+                            in_cache=True))
+                        num_cached_nodes += 1
+                        if num_cached_nodes >= self.cache_nearest:
+                            break
 
     async def _refresh_routing_table(self, *, period: Optional[float]) -> None:
         """ Tries to find new nodes for buckets that were unused for more than self.staleness_timeout """
-        while period is not None:  # if None run once, otherwise run forever
+        while self.is_alive and period is not None:  # if None run once, otherwise run forever
             refresh_time = get_dht_time()
             staleness_threshold = refresh_time - period
             stale_buckets = [bucket for bucket in self.protocol.routing_table.buckets
@@ -400,3 +520,45 @@ class DHTNode:
                 await self.find_nearest_nodes(refresh_id)
 
             await asyncio.sleep(max(0.0, period - (get_dht_time() - refresh_time)))
+
+
+@dataclass(init=True, repr=True, frozen=False, order=False)
+class _IntermediateResult:
+    """ A helper class that stores current-best GET results with metadata """
+    key_id: DHTID
+    sufficient_expiration_time: DHTExpiration
+    binary_value: Optional[BinaryDHTValue] = None
+    expiration_time: Optional[DHTExpiration] = None  # best expiration time so far
+    source_node_id: Optional[DHTID] = None  # node that gave us the value
+    future: asyncio.Future[Tuple[Optional[DHTValue], Optional[DHTExpiration]]] = field(default_factory=asyncio.Future)
+    serializer: type(SerializerBase) = MSGPackSerializer
+
+    def add_candidate(self, binary_value: Optional[BinaryDHTValue], expiration_time: Optional[DHTExpiration],
+                      source_node_id: Optional[DHTID]):
+        if not self.finished and (expiration_time or -float('inf')) > (self.expiration_time or -float('inf')):
+            self.binary_value, self.expiration_time, self.source_node_id = binary_value, expiration_time, source_node_id
+            if self.expiration_time >= self.sufficient_expiration_time:
+                self.finish_search()
+
+    def add_done_callback(self, callback: Callable[[_IntermediateResult], Any]):
+        """ Add callback that will be called when _IntermediateResult is done (found OR cancelled by user) """
+        self.future.add_done_callback(lambda _future: callback(self))
+
+    def finish_search(self):
+        if self.future.done():
+            return  # either user cancelled our result or someone sent it before us. Nothing more to do here.
+        deserialized_value = self.serializer.loads(self.binary_value) if self.found_something else None
+        self.future.set_result((deserialized_value, self.expiration_time))
+
+    @property
+    def found_something(self) -> bool:
+        """ Whether or not we have at least some result, regardless of its expiration time """
+        return self.expiration_time is not None
+
+    @property
+    def finished(self) -> bool:
+        return self.future.done()
+
+    def __lt__(self, other: _IntermediateResult):
+        """ _IntermediateResult instances will be sorted by their target expiration time """
+        return self.sufficient_expiration_time < other.sufficient_expiration_time
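
For reference, the reworked get path can be driven either synchronously or through the new return_futures flag. A minimal sketch modeled on tests/test_dht_node.py from this PR (two local nodes, illustrative key 'k1'):

```python
import asyncio
import hivemind

async def demo():
    # two DHT nodes on localhost; node2 stores a value, node1 retrieves it
    node2 = await hivemind.DHTNode.create()
    node1 = await hivemind.DHTNode.create(initial_peers=[f'localhost:{node2.port}'])

    await node2.store('k1', 123, expiration_time=hivemind.get_dht_time() + 60)

    # return_futures=True hands back one asyncio future per key right away;
    # awaiting a future yields (value, expiration_time), or (None, None) if not found
    futures = await node1.get_many(['k1', 'missing_key'], return_futures=True)
    value, expiration_time = await futures['k1']
    assert value == 123
    assert await futures['missing_key'] == (None, None)

    await asyncio.gather(node1.shutdown(), node2.shutdown())

asyncio.run(demo())
```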

+ 50 - 12
hivemind/dht/protocol.py

@@ -3,6 +3,7 @@ from __future__ import annotations
 
 import asyncio
 import heapq
+from contextlib import contextmanager
 from typing import Optional, List, Tuple, Dict, Iterator, Any, Sequence, Union, Collection
 from warnings import warn
 
@@ -265,16 +266,17 @@ class LocalStorage:
 
     def __init__(self, maxsize: Optional[int] = None):
         self.cache_size = maxsize or float("inf")
-        self.data = dict()
-        self.expiration_heap = []
-        self.key_to_heap = dict()
-
-    def remove_outdated(self):
-        while self.expiration_heap and (self.expiration_heap[0][0] < get_dht_time()
-                                        or len(self.expiration_heap) > self.cache_size):
+        self.data: Dict[DHTID, Tuple[BinaryDHTValue, DHTExpiration]] = dict()
+        self.expiration_heap: List[Tuple[DHTExpiration, DHTID]] = []
+        self.key_to_heap: Dict[DHTID, Tuple[DHTExpiration, DHTID]] = dict()
+        self.frozen = False  # if True, do not remove outdated elements
+
+    def _remove_outdated(self):
+        while not self.frozen and self.expiration_heap and (self.expiration_heap[0][0] < get_dht_time()
+                                                            or len(self.expiration_heap) > self.cache_size):
             heap_entry = heapq.heappop(self.expiration_heap)
             key = heap_entry[1]
-            if self.key_to_heap[key] == heap_entry:
+            if self.key_to_heap.get(key) == heap_entry:
                 del self.data[key], self.key_to_heap[key]
 
     def store(self, key: DHTID, value: BinaryDHTValue, expiration_time: DHTExpiration) -> bool:
@@ -282,7 +284,7 @@ class LocalStorage:
         Store a (key, value) pair locally at least until expiration_time. See class docstring for details.
         :returns: True if new value was stored, False it was rejected (current value is newer)
         """
-        if expiration_time < get_dht_time():
+        if expiration_time < get_dht_time() and not self.frozen:
             return False
         self.key_to_heap[key] = (expiration_time, key)
         heapq.heappush(self.expiration_heap, (expiration_time, key))
@@ -292,17 +294,53 @@ class LocalStorage:
                 return True
             return False
         self.data[key] = (value, expiration_time)
-        self.remove_outdated()
+        self._remove_outdated()
         return True
 
     def get(self, key: DHTID) -> (Optional[BinaryDHTValue], Optional[DHTExpiration]):
         """ Get a value corresponding to a key if that (key, value) pair was previously stored here. """
-        self.remove_outdated()
+        self._remove_outdated()
         if key in self.data:
             return self.data[key]
         return None, None
 
     def items(self) -> Iterator[Tuple[DHTID, BinaryDHTValue, DHTExpiration]]:
         """ Iterate over (key, value, expiration_time) tuples stored in this storage """
-        self.remove_outdated()
+        self._remove_outdated()
         return ((key, value, expiration_time) for key, (value, expiration_time) in self.data.items())
+
+    def top(self) -> Optional[Tuple[DHTID, BinaryDHTValue, DHTExpiration]]:
+        """ Return the entry with earliest expiration or None if there isn't any """
+        self._remove_outdated()
+        if self.data:
+            top_entry, top_key = self.expiration_heap[0], self.expiration_heap[0][1]
+            while self.key_to_heap.get(top_key) != top_entry:
+                heapq.heappop(self.expiration_heap)  # skip leftover "ghost" entries until first real entry
+                top_entry, top_key = self.expiration_heap[0], self.expiration_heap[0][1]
+            value, expiration = self.data[top_key]
+            return top_key, value, expiration
+
+    def __contains__(self, key: DHTID):
+        self._remove_outdated()
+        return key in self.data
+
+    def __len__(self):
+        self._remove_outdated()
+        return len(self.data)
+
+    def __delitem__(self, key: DHTID):
+        if key in self.key_to_heap:
+            del self.data[key], self.key_to_heap[key]
+        # note: key may still be in self.expiration_heap, but it will not be used and will eventually be removed by ._remove_outdated()
+
+    def __bool__(self):
+        return bool(self.data)
+
+    @contextmanager
+    def freeze(self):
+        """ Temporarily stop removing outdated elements (._remove_outdated) inside this context to ensure consistency """
+        prev_frozen, self.frozen = self.frozen, True
+        try:
+            yield self
+        finally:
+            self.frozen = prev_frozen
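
The new LocalStorage helpers (top, __contains__, __delitem__, freeze) are exercised by tests/test_dht_storage.py below; here is a short usage sketch under the same assumptions (maxsize and expiration offsets are arbitrary):

```python
import time

from hivemind import DHTID, get_dht_time
from hivemind.dht.protocol import LocalStorage

storage = LocalStorage(maxsize=2)

# while frozen, expired or excess entries are not evicted, so reads stay consistent
with storage.freeze():
    storage.store(DHTID.generate("key1"), b"val1", get_dht_time() + 0.01)
    time.sleep(0.05)
    assert DHTID.generate("key1") in storage  # already expired, but kept until unfrozen
assert DHTID.generate("key1") not in storage  # evicted on the next access

# top() returns the (key, value, expiration_time) entry with the earliest expiration
storage.store(DHTID.generate("key2"), b"val2", get_dht_time() + 10)
storage.store(DHTID.generate("key3"), b"val3", get_dht_time() + 20)
key_id, value, expiration_time = storage.top()
assert key_id == DHTID.generate("key2") and value == b"val2"
```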

+ 16 - 11
hivemind/dht/traverse.py

@@ -11,7 +11,7 @@ ROOT = 0  # alias for heap root
 
 async def simple_traverse_dht(query_id: DHTID, initial_nodes: Collection[DHTID], beam_size: int,
                               get_neighbors: Callable[[DHTID], Awaitable[Tuple[Collection[DHTID], bool]]],
-                              visited_nodes: Collection[DHTID] = ()) -> Tuple[List[DHTID], Set[DHTID]]:
+                              visited_nodes: Collection[DHTID] = ()) -> Tuple[Tuple[DHTID], Set[DHTID]]:
     """
     Traverse the DHT graph using get_neighbors function, find :beam_size: nearest nodes according to DHTID.xor_distance.
 
@@ -64,7 +64,7 @@ async def simple_traverse_dht(query_id: DHTID, initial_nodes: Collection[DHTID],
 
 async def traverse_dht(
         queries: Collection[DHTID], initial_nodes: List[DHTID], beam_size: int, num_workers: int, queries_per_call: int,
-        get_neighbors: Callable[[DHTID, Collection[DHTID]], Awaitable[Dict[DHTID, Tuple[List[DHTID], bool]]]],
+        get_neighbors: Callable[[DHTID, Collection[DHTID]], Awaitable[Dict[DHTID, Tuple[Tuple[DHTID], bool]]]],
         found_callback: Optional[Callable[[DHTID, List[DHTID], Set[DHTID]], Awaitable[Any]]] = None,
         await_all_tasks: bool = True, visited_nodes: Optional[Dict[DHTID, Set[DHTID]]] = (),
 ) -> Tuple[Dict[DHTID, List[DHTID]], Dict[DHTID, Set[DHTID]]]:
@@ -90,8 +90,9 @@ async def traverse_dht(
         The search terminates iff each query is either stopped via should_stop or finds beam_size nearest nodes.
 
     :param found_callback: if specified, call this callback for each finished query the moment it finishes or is stopped
-        More specifically, run asyncio.create_task(found_found_callback(query, nearest_to_query, visited_for_query))
+        More specifically, run asyncio.create_task(found_callback(query, nearest_to_query, visited_for_query))
         Using this callback allows one to process results faster before traverse_dht is finishes for all queries.
+        It is guaranteed that found_callback will be called exactly once on each query in queries.
 
     :param await_all_tasks: if True, wait for all tasks to finish before returning, otherwise returns after finding
         nearest neighbors and finishes the remaining tasks (callbacks and queries to known-but-unvisited nodes)
@@ -133,10 +134,14 @@ async def traverse_dht(
 
     def heuristic_priority(heap_query: DHTID):
         """ Workers prioritize expanding nodes that lead to under-explored queries (by other workers) """
-        if len(candidate_nodes[heap_query]) == 0:
-            return float('inf'), float('inf')
-        else:  # prefer candidates in heaps with least number of concurrent workers, break ties by distance to query
+        if has_candidates(heap_query):
+            # prefer candidates in heaps with least number of concurrent workers, break ties by distance to query
             return active_workers[heap_query], candidate_nodes[heap_query][ROOT][0]
+        return float('inf'), float('inf')  # try not to explore vertices with no candidates
+
+    def has_candidates(query: DHTID):
+        """ Whether this query's heap contains at least one candidate node that can be explored """
+        return candidate_nodes[query] and candidate_nodes[query][ROOT][0] <= upper_bound(query)
 
     def upper_bound(query: DHTID):
         """ Any node that is farther from query than upper_bound(query) will not be added to heaps """
@@ -156,7 +161,8 @@ async def traverse_dht(
             # select the heap based on priority
             chosen_query: DHTID = min(unfinished_queries, key=heuristic_priority)
 
-            if len(candidate_nodes[chosen_query]) == 0:  # if there are no peers to explore...
+            # if there are no peers to explore...
+            if not has_candidates(chosen_query):
                 other_workers_pending = active_workers.most_common(1)[0][1] > 0
                 if other_workers_pending:  # ... wait for other workers (if any) or add more peers
                     heap_updated_event.clear()
@@ -169,10 +175,9 @@ async def traverse_dht(
 
             # select vertex to be explored
             chosen_distance_to_query, chosen_peer = heapq.heappop(candidate_nodes[chosen_query])
-            if chosen_peer in visited_nodes[chosen_query]:
-                continue
-            if chosen_distance_to_query > upper_bound(chosen_query):
-                finish_search(chosen_query)
+            if chosen_peer in visited_nodes[chosen_query] or chosen_distance_to_query > upper_bound(chosen_query):
+                if chosen_distance_to_query > upper_bound(chosen_query) and active_workers[chosen_query] == 0:
+                    finish_search(chosen_query)
                 continue
 
             # find additional queries to pack in the same request

+ 1 - 0
requirements.txt

@@ -3,6 +3,7 @@ torch>=1.3.0
 numpy>=1.17
 prefetch_generator>=1.0.1
 umsgpack
+sortedcontainers
 uvloop>=0.14.0
 grpcio>=1.31
 grpcio-tools>=1.30.0
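
sortedcontainers is a new dependency: DHTNode.pending_get_requests (node.py above) keeps concurrent get requests in a SortedList ordered by descending sufficient_expiration_time, so the least demanding request sits at the tail. A toy illustration of that ordering; the dict-based request objects here are stand-ins, not hivemind types:

```python
from sortedcontainers import SortedList

# sort descending by sufficient_expiration_time, as pending_get_requests does
pending = SortedList(key=lambda request: -request['sufficient_expiration_time'])
pending.add({'key': 'k1', 'sufficient_expiration_time': 25.0})
pending.add({'key': 'k1', 'sufficient_expiration_time': 10.0})

# the least demanding request is at the tail; it is the first to be satisfied and popped
assert pending[-1]['sufficient_expiration_time'] == 10.0
pending.pop(-1)
```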

+ 76 - 0
tests/test_dht_experts.py

@@ -0,0 +1,76 @@
+import random
+import uuid
+from itertools import chain
+
+import hivemind
+from hivemind import LOCALHOST
+
+
+def test_hivemind_dht():
+    peers = [hivemind.DHT(start=True)]
+    for i in range(10):
+        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
+        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))
+
+    you: hivemind.dht.DHT = random.choice(peers)
+    theguyshetoldyounottoworryabout: hivemind.dht.DHT = random.choice(peers)
+
+    expert_uids = [str(uuid.uuid4()) for _ in range(110)]
+    batch_size = 10
+    for batch_start in range(0, len(expert_uids), batch_size):
+        you.declare_experts(expert_uids[batch_start: batch_start + batch_size], 'localhost', 1234)
+
+    found = theguyshetoldyounottoworryabout.get_experts(random.sample(expert_uids, 5) + ['foo', 'bar'])
+    assert all(res is not None for res in found[:-2]), "Could not find some existing experts"
+    assert all(res is None for res in found[-2:]), "Found non-existing experts"
+
+    that_guys_expert, that_guys_port = str(uuid.uuid4()), random.randint(1000, 9999)
+    theguyshetoldyounottoworryabout.declare_experts([that_guys_expert], f'that_host:{that_guys_port}')
+    you_notfound, you_found = you.get_experts(['foobar', that_guys_expert])
+    assert isinstance(you_found, hivemind.RemoteExpert)
+    assert you_found.endpoint == f'that_host:{that_guys_port}'
+
+    # test first_k_active
+    assert list(theguyshetoldyounottoworryabout.first_k_active(expert_uids, k=10)) == expert_uids[:10]
+
+    some_permuted_experts = random.sample(expert_uids, k=32)
+    assert list(theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=32)) == some_permuted_experts
+    assert list(theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=1)) == some_permuted_experts[:1]
+    fake_and_real_experts = list(chain(*zip(
+        [str(uuid.uuid4()) for _ in some_permuted_experts], some_permuted_experts)))
+    assert list(theguyshetoldyounottoworryabout.first_k_active(fake_and_real_experts, k=9)) == some_permuted_experts[:9]
+
+    for peer in peers:
+        peer.shutdown()
+
+
+def test_first_k_active():
+    node = hivemind.DHT(start=True)
+    assert all(node.declare_experts(['e.1.2.3', 'e.1.2.4', 'e.3.4.5'], endpoint=f"{hivemind.LOCALHOST}:1337"))
+    assert all(node.declare_experts(['e.2.1.1'], endpoint=f"{hivemind.LOCALHOST}:1338"))
+
+    results = node.first_k_active(['e.0', 'e.1', 'e.2', 'e.3'], k=2)
+    assert len(results) == 2 and next(iter(results.keys())) == 'e.1'
+    assert results['e.1'].uid in ('e.1.2.3', 'e.1.2.4') and results['e.1'].endpoint == f"{hivemind.LOCALHOST}:1337"
+    assert results['e.2'].uid == 'e.2.1.1' and results['e.2'].endpoint == f"{hivemind.LOCALHOST}:1338"
+
+    results = node.first_k_active(['e', 'e.1', 'e.1.2', 'e.1.2.3'], k=10)
+    assert len(results) == 4
+    assert 'e' in results
+    for k in ('e.1', 'e.1.2', 'e.1.2.3'):
+        assert results[k].uid in ('e.1.2.3', 'e.1.2.4') and results[k].endpoint == f"{hivemind.LOCALHOST}:1337"
+
+
+def test_dht_single_node():
+    node = hivemind.DHT(start=True)
+    assert node.first_k_active(['e3', 'e2'], k=3) == {}
+    assert node.get_experts(['e3', 'e2']) == [None, None]
+
+    assert all(node.declare_experts(['e1', 'e2', 'e3'], f"{hivemind.LOCALHOST}:1337"))
+    for expert in node.get_experts(['e3', 'e2']):
+        assert expert.endpoint == f"{hivemind.LOCALHOST}:1337"
+    active_found = node.first_k_active(['e0', 'e1', 'e3', 'e5', 'e2'], k=2)
+    assert list(active_found.keys()) == ['e1', 'e3']
+    assert all(expert.uid.startswith(prefix) for prefix, expert in active_found.items())
+
+    assert all(node.declare_experts(['e1', 'e2', 'e3'], f"{hivemind.LOCALHOST}:1337"))

+ 80 - 100
tests/test_dht.py → tests/test_dht_node.py

@@ -1,10 +1,7 @@
-import time
 import asyncio
 import multiprocessing as mp
 import random
 import heapq
-import uuid
-from itertools import chain
 from typing import Optional
 import numpy as np
 
@@ -13,7 +10,7 @@ from typing import List, Dict
 
 from hivemind import get_dht_time
 from hivemind.dht.node import DHTID, Endpoint, DHTNode, LOCALHOST, DHTProtocol
-from hivemind.dht.protocol import LocalStorage
+from hivemind.dht.protocol import DHTProtocol
 
 
 def run_protocol_listener(port: int, dhtid: DHTID, started: mp.synchronize.Event, ping: Optional[Endpoint] = None):
@@ -265,111 +262,94 @@ def test_dht_node():
         proc.terminate()
 
 
-def test_hivemind_dht():
-    peers = [hivemind.DHT(start=True)]
-    for i in range(10):
-        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
-        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))
-
-    you: hivemind.dht.DHT = random.choice(peers)
-    theguyshetoldyounottoworryabout: hivemind.dht.DHT = random.choice(peers)
-
-    expert_uids = [str(uuid.uuid4()) for _ in range(110)]
-    batch_size = 10
-    for batch_start in range(0, len(expert_uids), batch_size):
-        you.declare_experts(expert_uids[batch_start: batch_start + batch_size], 'localhost', 1234)
-
-    found = theguyshetoldyounottoworryabout.get_experts(random.sample(expert_uids, 5) + ['foo', 'bar'])
-    assert all(res is not None for res in found[:-2]), "Could not find some existing experts"
-    assert all(res is None for res in found[-2:]), "Found non-existing experts"
-
-    that_guys_expert, that_guys_port = str(uuid.uuid4()), random.randint(1000, 9999)
-    theguyshetoldyounottoworryabout.declare_experts([that_guys_expert], f'that_host:{that_guys_port}')
-    you_notfound, you_found = you.get_experts(['foobar', that_guys_expert])
-    assert isinstance(you_found, hivemind.RemoteExpert)
-    assert you_found.endpoint == f'that_host:{that_guys_port}'
-
-    # test first_k_active
-    assert list(theguyshetoldyounottoworryabout.first_k_active(expert_uids, k=10)) == expert_uids[:10]
-
-    some_permuted_experts = random.sample(expert_uids, k=32)
-    assert list(theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=32)) == some_permuted_experts
-    assert list(theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=1)) == some_permuted_experts[:1]
-    fake_and_real_experts = list(chain(*zip(
-        [str(uuid.uuid4()) for _ in some_permuted_experts], some_permuted_experts)))
-    assert list(theguyshetoldyounottoworryabout.first_k_active(fake_and_real_experts, k=9)) == some_permuted_experts[:9]
-
-    for peer in peers:
-        peer.shutdown()
-
-
-def test_dht_single_node():
-    node = hivemind.DHT(start=True)
-    assert node.first_k_active(['e3', 'e2'], k=3) == {}
-    assert node.get_experts(['e3', 'e2']) == [None, None]
-
-    assert all(node.declare_experts(['e1', 'e2', 'e3'], f"{hivemind.LOCALHOST}:1337"))
-    for expert in node.get_experts(['e3', 'e2']):
-        assert expert.endpoint == f"{hivemind.LOCALHOST}:1337"
-    active_found = node.first_k_active(['e0', 'e1', 'e3', 'e5', 'e2'], k=2)
-    assert list(active_found.keys()) == ['e1', 'e3']
-    assert all(expert.uid.startswith(prefix) for prefix, expert in active_found.items())
-
-    assert all(node.declare_experts(['e1', 'e2', 'e3'], f"{hivemind.LOCALHOST}:1337"))
-
-
-def test_first_k_active():
-    node = hivemind.DHT(start=True)
-    assert all(node.declare_experts(['e.1.2.3', 'e.1.2.4', 'e.3.4.5'], endpoint=f"{hivemind.LOCALHOST}:1337"))
-    assert all(node.declare_experts(['e.2.1.1'], endpoint=f"{hivemind.LOCALHOST}:1338"))
+def test_dhtnode_caching(T=0.05):
+    test_success = mp.Event()
 
-    results = node.first_k_active(['e.0', 'e.1', 'e.2', 'e.3'], k=2)
-    assert len(results) == 2 and next(iter(results.keys())) == 'e.1'
-    assert results['e.1'].uid in ('e.1.2.3', 'e.1.2.4') and results['e.1'].endpoint == f"{hivemind.LOCALHOST}:1337"
-    assert results['e.2'].uid == 'e.2.1.1' and results['e.2'].endpoint == f"{hivemind.LOCALHOST}:1338"
+    async def _tester():
+        node2 = await hivemind.DHTNode.create(cache_refresh_before_expiry=5 * T, reuse_get_requests=False)
+        node1 = await hivemind.DHTNode.create(initial_peers=[f'localhost:{node2.port}'],
+                                              cache_refresh_before_expiry=5 * T, listen=False, reuse_get_requests=False)
+        await node2.store('k', [123, 'value'], expiration_time=hivemind.get_dht_time() + 7 * T)
+        await node2.store('k2', [654, 'value'], expiration_time=hivemind.get_dht_time() + 7 * T)
+        await node2.store('k3', [654, 'value'], expiration_time=hivemind.get_dht_time() + 15 * T)
+        await node1.get_many(['k', 'k2', 'k3', 'k4'])
+        assert len(node1.protocol.cache) == 3
+        assert len(node1.cache_refresh_queue) == 0
+
+        await node1.get_many(['k', 'k2', 'k3', 'k4'])
+        assert len(node1.cache_refresh_queue) == 3
+
+        await node2.store('k', [123, 'value'], expiration_time=hivemind.get_dht_time() + 12 * T)
+        await asyncio.sleep(4 * T)
+        await node1.get('k')
+        await asyncio.sleep(1 * T)
+
+        assert len(node1.protocol.cache) == 3
+        assert len(node1.cache_refresh_queue) == 2
+        await asyncio.sleep(3 * T)
+
+        assert len(node1.cache_refresh_queue) == 1
+
+        await asyncio.sleep(5 * T)
+        assert len(node1.cache_refresh_queue) == 0
+        await asyncio.sleep(5 * T)
+        assert len(node1.cache_refresh_queue) == 0
+
+        await node2.store('k', [123, 'value'], expiration_time=hivemind.get_dht_time() + 10 * T)
+        await node1.get('k')
+        await asyncio.sleep(1 * T)
+        assert len(node1.cache_refresh_queue) == 0
+        await node1.get('k')
+        await asyncio.sleep(1 * T)
+        assert len(node1.cache_refresh_queue) == 1
+
+        await asyncio.sleep(5 * T)
+        assert len(node1.cache_refresh_queue) == 0
+
+        await asyncio.gather(node1.shutdown(), node2.shutdown())
+        test_success.set()
 
-    results = node.first_k_active(['e', 'e.1', 'e.1.2', 'e.1.2.3'], k=10)
-    assert len(results) == 4
-    assert 'e' in results
-    for k in ('e.1', 'e.1.2', 'e.1.2.3'):
-        assert results[k].uid in ('e.1.2.3', 'e.1.2.4') and results[k].endpoint == f"{hivemind.LOCALHOST}:1337"
+    proc = mp.Process(target=lambda: asyncio.run(_tester()))
+    proc.start()
+    proc.join()
+    assert test_success.is_set()
 
 
+def test_dhtnode_reuse_get():
+    test_success = mp.Event()
 
-def test_store():
-    d = LocalStorage()
-    d.store(DHTID.generate("key"), b"val", get_dht_time() + 0.5)
-    assert d.get(DHTID.generate("key"))[0] == b"val", "Wrong value"
-    print("Test store passed")
+    async def _tester():
+        peers = []
+        for i in range(10):
+            neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
+            peers.append(await hivemind.DHTNode.create(initial_peers=neighbors_i, parallel_rpc=256))
 
+        await asyncio.gather(
+            random.choice(peers).store('k1', 123, hivemind.get_dht_time() + 999),
+            random.choice(peers).store('k2', 567, hivemind.get_dht_time() + 999)
+        )
 
-def test_get_expired():
-    d = LocalStorage()
-    d.store(DHTID.generate("key"), b"val", get_dht_time() + 0.1)
-    time.sleep(0.5)
-    assert d.get(DHTID.generate("key")) == (None, None), "Expired value must be deleted"
-    print("Test get expired passed")
+        you = random.choice(peers)
 
+        futures1 = await you.get_many(['k1', 'k2'], return_futures=True)
+        assert len(you.pending_get_requests[DHTID.generate('k1')]) == 1
+        assert len(you.pending_get_requests[DHTID.generate('k2')]) == 1
 
-def test_get_empty():
-    d = LocalStorage()
-    assert d.get(DHTID.generate(source="key")) == (None, None), "LocalStorage returned non-existent value"
-    print("Test get expired passed")
+        futures2 = await you.get_many(['k2', 'k3'], return_futures=True)
+        assert len(you.pending_get_requests[DHTID.generate('k2')]) == 2
 
+        await asyncio.gather(*futures1.values(), *futures2.values())
+        futures3 = await you.get_many(['k3'], return_futures=True)
+        assert len(you.pending_get_requests[DHTID.generate('k1')]) == 0
+        assert len(you.pending_get_requests[DHTID.generate('k2')]) == 0
+        assert len(you.pending_get_requests[DHTID.generate('k3')]) == 1
 
-def test_change_expiration_time():
-    d = LocalStorage()
-    d.store(DHTID.generate("key"), b"val1", get_dht_time() + 1)
-    assert d.get(DHTID.generate("key"))[0] == b"val1", "Wrong value"
-    d.store(DHTID.generate("key"), b"val2", get_dht_time() + 200)
-    time.sleep(1)
-    assert d.get(DHTID.generate("key"))[0] == b"val2", "Value must be changed, but still kept in table"
-    print("Test change expiration time passed")
-
+        assert (await futures1['k1'])[0] == 123
+        assert await futures1['k2'] == await futures2['k2'] and (await futures1['k2'])[0] == 567
+        assert await futures2['k3'] == await futures3['k3'] and (await futures3['k3']) == (None, None)
+        test_success.set()
 
-def test_maxsize_cache():
-    d = LocalStorage(maxsize=1)
-    d.store(DHTID.generate("key1"), b"val1", get_dht_time() + 1)
-    d.store(DHTID.generate("key2"), b"val2", get_dht_time() + 200)
-    assert d.get(DHTID.generate("key2"))[0] == b"val2", "Value with bigger exp. time must be kept"
-    assert d.get(DHTID.generate("key1"))[0] is None, "Value with less exp time, must be deleted"
+    proc = mp.Process(target=lambda: asyncio.run(_tester()))
+    proc.start()
+    proc.join()
+    assert test_success.is_set()

+ 79 - 0
tests/test_dht_storage.py

@@ -0,0 +1,79 @@
+import time
+
+from hivemind import DHTID, get_dht_time
+from hivemind.dht.protocol import LocalStorage
+
+
+def test_store():
+    d = LocalStorage()
+    d.store(DHTID.generate("key"), b"val", get_dht_time() + 0.5)
+    assert d.get(DHTID.generate("key"))[0] == b"val", "Wrong value"
+    print("Test store passed")
+
+
+def test_get_expired():
+    d = LocalStorage()
+    d.store(DHTID.generate("key"), b"val", get_dht_time() + 0.1)
+    time.sleep(0.5)
+    assert d.get(DHTID.generate("key")) == (None, None), "Expired value must be deleted"
+    print("Test get expired passed")
+
+
+def test_get_empty():
+    d = LocalStorage()
+    assert d.get(DHTID.generate(source="key")) == (None, None), "LocalStorage returned non-existent value"
+    print("Test get expired passed")
+
+
+def test_change_expiration_time():
+    d = LocalStorage()
+    d.store(DHTID.generate("key"), b"val1", get_dht_time() + 1)
+    assert d.get(DHTID.generate("key"))[0] == b"val1", "Wrong value"
+    d.store(DHTID.generate("key"), b"val2", get_dht_time() + 200)
+    time.sleep(1)
+    assert d.get(DHTID.generate("key"))[0] == b"val2", "Value must be changed, but still kept in table"
+    print("Test change expiration time passed")
+
+
+def test_maxsize_cache():
+    d = LocalStorage(maxsize=1)
+    d.store(DHTID.generate("key1"), b"val1", get_dht_time() + 1)
+    d.store(DHTID.generate("key2"), b"val2", get_dht_time() + 200)
+    assert d.get(DHTID.generate("key2"))[0] == b"val2", "Value with bigger exp. time must be kept"
+    assert d.get(DHTID.generate("key1"))[0] is None, "Value with less exp time, must be deleted"
+
+
+def test_localstorage_top():
+    d = LocalStorage(maxsize=3)
+    d.store(DHTID.generate("key1"), b"val1", get_dht_time() + 1)
+    d.store(DHTID.generate("key2"), b"val2", get_dht_time() + 2)
+    d.store(DHTID.generate("key3"), b"val3", get_dht_time() + 4)
+    assert d.top()[:2] == (DHTID.generate("key1"), b"val1")
+
+    d.store(DHTID.generate("key1"), b"val1_new", get_dht_time() + 3)
+    assert d.top()[:2] == (DHTID.generate("key2"), b"val2")
+
+    del d[DHTID.generate('key2')]
+    assert d.top()[:2] == (DHTID.generate("key1"), b"val1_new")
+    d.store(DHTID.generate("key2"), b"val2_new", get_dht_time() + 5)
+    d.store(DHTID.generate("key4"), b"val4", get_dht_time() + 6)  # key4 will push out key1 due to maxsize
+
+    assert d.top()[:2] == (DHTID.generate("key3"), b"val3")
+
+
+def test_localstorage_freeze():
+    d = LocalStorage(maxsize=2)
+
+    with d.freeze():
+        d.store(DHTID.generate("key1"), b"val1", get_dht_time() + 0.01)
+        assert DHTID.generate("key1") in d
+        time.sleep(0.03)
+        assert DHTID.generate("key1") in d
+    assert DHTID.generate("key1") not in d
+
+    with d.freeze():
+        d.store(DHTID.generate("key1"), b"val1", get_dht_time() + 1)
+        d.store(DHTID.generate("key2"), b"val2", get_dht_time() + 2)
+        d.store(DHTID.generate("key3"), b"val3", get_dht_time() + 3)  # key3 will push key1 out due to maxsize
+        assert DHTID.generate("key1") in d
+    assert DHTID.generate("key1") not in d