
Simplify & explain hivemind.dht.DHT (#78)

* unused import

* remove loop.run_forever

* * explain how DHT stores stuff (hivemind/dht/__init__.py)
* implement more efficient DHT.first_k_active (use get_many)
* change uid_delimiter into a DHT instance property

* hivemind.DHT is now thread-safe on client side

* implement all mpfuture methods

* get_experts: allow returning future

* rollback run_in_background

* switch back to GLOBAL_EXECUTOR (tests should fail)

* instantiate executor in each process to avoid os locks

* rollback to minimize diff

* typo

* traverse_dht can now be cancelled

* node.store_many and node.get_many can now be cancelled

* address review by mryab@

* address review by mryab@

* address review by mryab@

* address review by mryab@

* address review by mryab@
justheuristic committed 5 years ago
commit 535318e249

+ 1 - 1
.circleci/config.yml

@@ -27,7 +27,7 @@ jobs:
           command: sudo python setup.py develop
           name: setup
       - run:
-          command: for test_file in tests/test*.py; do pytest $test_file --full-trace; done
+          command: pytest ./tests
           name: tests
       - run:
           command: python tests/benchmark_throughput.py --preset minimalistic

+ 4 - 4
hivemind/client/moe.py

@@ -104,7 +104,7 @@ class RemoteMixtureOfExperts(nn.Module):
         beam = np.array([[self.uid_prefix]] * batch_size, dtype=object)  # [batch_size, up_to_beam_size]
         scores = np.zeros([batch_size, 1], dtype=np.float64)

-        delimeters = np.array(self.dht.UID_DELIMETER)[None, None, None]  # pre-compute numpy array for fast concat
+        delimiters = np.array(self.dht.UID_DELIMITER)[None, None, None]  # pre-compute numpy array for fast concat

         for dim_index, dim_scores in enumerate(grid_scores):
             dim_scores = dim_scores.detach().cpu().numpy()
@@ -112,7 +112,7 @@ class RemoteMixtureOfExperts(nn.Module):
 
 
             # create all possible successsors from current beam
             dim_indices = np.arange(dim_scores.shape[1]).astype(str)
-            new_candidates = beam[:, :, None] + delimeters + dim_indices[None, None, :]
+            new_candidates = beam[:, :, None] + delimiters + dim_indices[None, None, :]
             new_candidates = new_candidates.reshape([batch_size, -1])

             new_scores = scores[:, :, None] + dim_scores[:, None, :]
@@ -166,8 +166,8 @@ class RemoteMixtureOfExperts(nn.Module):
 
 
         grid_indices = np.zeros([len(flat_experts), len(grid_scores)], dtype=np.int64)
         for i, expert in enumerate(flat_experts):
-            expert_indices = expert.uid[len(self.uid_prefix) + len(self.dht.UID_DELIMETER):]
-            expert_indices = list(map(int, expert_indices.split(self.dht.UID_DELIMETER)))
+            expert_indices = expert.uid[len(self.uid_prefix) + len(self.dht.UID_DELIMITER):]
+            expert_indices = list(map(int, expert_indices.split(self.dht.UID_DELIMITER)))
             grid_indices[i] = expert_indices

         scores_per_dim = [

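The candidate expansion above builds new uid strings via numpy broadcasting over object arrays, where "+" concatenates strings element-wise. Below is a standalone sketch of that trick, illustrative only and not code from this commit; batch size, beam width, grid size and the "ffn_expert" prefix are invented:

    import numpy as np

    batch_size, beam_width, grid_dim = 2, 3, 4
    # current beam of uid prefixes: shape [batch_size, beam_width], dtype=object so "+" concatenates strings
    beam = np.array([[f"ffn_expert.{i}" for i in range(beam_width)]] * batch_size, dtype=object)
    delimiters = np.array('.')[None, None, None]   # pre-computed delimiter, shape [1, 1, 1]
    dim_indices = np.arange(grid_dim).astype(str)  # indices along the next grid dimension: '0' ... '3'

    # [batch, beam, 1] + [1, 1, 1] + [1, 1, grid_dim] broadcasts to [batch, beam, grid_dim]
    new_candidates = beam[:, :, None] + delimiters + dim_indices[None, None, :]
    print(new_candidates.reshape([batch_size, -1])[0, :4])
    # ['ffn_expert.0.0' 'ffn_expert.0.1' 'ffn_expert.0.2' 'ffn_expert.0.3']
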
+ 113 - 85
hivemind/dht/__init__.py

@@ -16,6 +16,8 @@ import asyncio
 import ctypes
 import multiprocessing as mp
 import warnings
+from collections import deque
+from concurrent.futures import ThreadPoolExecutor
 from typing import List, Optional, Sequence

 import uvloop
@@ -23,12 +25,12 @@ import uvloop
 from hivemind.client import RemoteExpert
 from hivemind.dht.node import DHTNode, DHTID, DHTExpiration
 from hivemind.dht.routing import get_dht_time
-from hivemind.utils import MPFuture, Endpoint, run_in_background
+from hivemind.utils import MPFuture, Endpoint


 class DHT(mp.Process):
     """
-    A high-level interface to hivemind DHT. Runs a dht node in a background process.
+    High-level interface to hivemind.dht that is designed to allow RemoteMixtureOfExperts to select best experts.

     :param initial_peers: one or multiple endpoints pointing to active DHT peers. Similar format to listen_on.
     :param listen_on: an interface for incoming connections, e.g. "127.0.0.1:*", "0.0.0.0:1234" or "ipv6:[::]:*"
@@ -36,19 +38,45 @@ class DHT(mp.Process):
     :param daemon: if True, the background process is marked as daemon and automatically terminated after main process
     :param max_workers: declare_experts and get_experts will use up to this many parallel workers
         (but no more than one per key)
+    :param expiration: experts declared from this node expire after this many seconds (default = 5 minutes)
+    :param receiver_threads: uses this many threads to await on input pipe. Default = 1 should be enough in most cases
     :param kwargs: any other params will be forwarded to DHTNode upon creation
+
+    Each expert has an identifier in the form of {prefix}.{i}.{j}.{...}, e.g. "ffn_expert.98.76.54.32.10"
+    An expert identifier consists of:
+
+        * optional prefix that determines expert role, experiment name, etc.
+        * one or more integers that determine that expert's position in an N-dimensional grid
+
+    A hivemind.Server can ``DHT.declare_experts(expert_uids: List[str])`` to make its experts visible to everyone.
+    When declaring experts, DHT will store each expert's uid and all its prefixes until :expiration: (specified at init)
+    For instance, declaring "ffn_expert.98.76.54.32.10" will store the following keys in a DHT:
+    ``"ffn_expert", "ffn_expert.98", "ffn_expert.98.76", ..., "ffn_expert.98.76.54.32.10"``
+
+    RemoteMixtureOfExperts can use these prefixes to find top-k most suitable experts with a left-to-right beam search.
+    For instance, consider RemoteMixtureOfExperts with prefix "ffn_expert" and grid size [100, 100, 100, 100, 100].
+    This MoE can query all experts with that prefix and arbitrary indices in 0...99 along each dimension.
+    However, not every expert in such 100^5 grid can be alive at a given moment of time (the grid size is redundant).
+    In order to find k best "alive" experts, MoE first ranks indices along the first dimension with its gating function.
+    It can then check which of those indices correspond to "alive" experts by querying keys such as "ffn_expert.98".
+    This is done using DHT.first_k_active function. After selecting k best indices along first dimension, MoE moves
+    to the second dimension. It can find top-k pairs of indices (e.g. "expert.98.76") that start with one of k first
+    indices from the previous step. Finally, MoE will use DHT.get_experts(uids: List[str]) search for specific experts.
+    This beam search explores one additional dimension per step and finds k best experts from across the DHT
+    in O(k / s * log(N)) average time where s is grid sparsity rate and N is the total number of experts.
     """
     """
-    UID_DELIMETER = '.'  # splits expert uids over this delimeter
-    EXPIRATION = 120  # anything written to DHT is considered expired after this many seconds
-    make_key = "{}::{}".format
+
+    UID_DELIMITER = '.'  # when declaring experts, DHT store all prefixes of that expert's uid, split over this prefix
+    #  formally, prefixes = {uid.split(UID_DELIMITER)[:length] for length in range(1, uid.count(UID_DELIMITER) + 2)}
 
 
     def __init__(self, listen_on: Endpoint = "0.0.0.0:*", initial_peers: Sequence[Endpoint] = (), *, start: bool,
-                 daemon: bool = True, max_workers: Optional[int] = None, parallel_rpc: Optional[int] = None, **kwargs):
+                 daemon: bool = True, max_workers: Optional[int] = None, parallel_rpc: Optional[int] = None,
+                 receiver_threads: int = 1, expiration: float = 300, **kwargs):
         super().__init__()
         self.listen_on, self.initial_peers, self.kwargs = listen_on, initial_peers, kwargs
-        self.max_workers, self.parallel_rpc = max_workers, parallel_rpc
+        self.receiver_threads, self.max_workers, self.parallel_rpc = receiver_threads, max_workers, parallel_rpc
+        self.expiration = expiration
         self._port = mp.Value(ctypes.c_int32, 0)  # initialized after dht starts
-        self.node: Optional[DHTNode] = None  # initialized inside self.run only
         self._pipe, self.pipe = mp.Pipe(duplex=True)
         self.ready = mp.Event()
         self.daemon = daemon
@@ -62,16 +90,20 @@ class DHT(mp.Process):
         uvloop.install()
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
-        self.node: DHTNode = loop.run_until_complete(DHTNode.create(
-            initial_peers=list(self.initial_peers), listen_on=self.listen_on, parallel_rpc=self.parallel_rpc,
-            num_workers=self.max_workers or 1, **self.kwargs))
-        self._port.value = self.node.port
-        run_in_background(loop.run_forever)
-        self.ready.set()
+        pipe_awaiter = ThreadPoolExecutor(self.receiver_threads)
+
+        async def _run():
+            node = await DHTNode.create(
+                initial_peers=list(self.initial_peers), listen_on=self.listen_on, parallel_rpc=self.parallel_rpc,
+                num_workers=self.max_workers or 1, **self.kwargs)
+            self._port.value = node.port
+            self.ready.set()
+
+            while True:
+                method, args, kwargs = await loop.run_in_executor(pipe_awaiter, self._pipe.recv)
+                asyncio.create_task(getattr(self, method)(node, *args, **kwargs))

-        while True:
-            method, args, kwargs = self._pipe.recv()
-            getattr(self, method)(*args, **kwargs)
+        loop.run_until_complete(_run())

     def run_in_background(self, await_ready=True, timeout=None):
         """
@@ -85,7 +117,7 @@ class DHT(mp.Process):
     def shutdown(self) -> None:
         """ Shuts down the dht process """
         if self.is_alive():
-            self.kill()
+            self.terminate()
         else:
             warnings.warn("DHT shutdown has no effect: dht process is already not alive")

@@ -93,32 +125,27 @@ class DHT(mp.Process):
     def port(self) -> Optional[int]:
         return self._port.value if self._port.value != 0 else None

-    def get_experts(self, uids: List[str], expiration=None) -> List[Optional[RemoteExpert]]:
+    def get_experts(self, uids: List[str], expiration_time: Optional[DHTExpiration] = None,
+                    wait=True) -> List[Optional[RemoteExpert]]:
         """
         :param uids: find experts with these ids from across the DHT
-        :param expiration: returns experts that expire no sooner than this (based on get_dht_time), default = now
+        :param expiration_time: if specified, return experts that expire no sooner than this (based on get_dht_time)
+        :param wait: if True (default), return when experts are returned. Otherwise return a Future.
         :returns: a list of [RemoteExpert if found else None]
         """
+        assert not isinstance(uids, str), "Please send a list / tuple of expert uids."
         future, _future = MPFuture.make_pair()
-        self.pipe.send(('_get_experts', [], dict(uids=uids, expiration=expiration, future=_future)))
-        return future.result()
+        self.pipe.send(('_get_experts', [], dict(uids=uids, expiration_time=expiration_time, future=_future)))
+        return future.result() if wait else future

-    def _get_experts(self, uids: List[str], expiration: Optional[DHTExpiration], future: MPFuture):
-        loop = asyncio.get_event_loop()
-        expiration = expiration or get_dht_time()
+    async def _get_experts(
+            self, node: DHTNode, uids: List[str], expiration_time: Optional[DHTExpiration], future: MPFuture):
+        if expiration_time is None:
+            expiration_time = get_dht_time()
         num_workers = len(uids) if self.max_workers is None else min(len(uids), self.max_workers)
-        keys = [self.make_key('expert', uid) for uid in uids]
-
-        response = asyncio.run_coroutine_threadsafe(
-            self.node.get_many(keys, expiration, num_workers=num_workers), loop).result()
-
-        experts: List[Optional[RemoteExpert]] = [None] * len(uids)
-        for i, (key, uid) in enumerate(zip(keys, uids)):
-            maybe_endpoint, maybe_expiration = response[key]
-            if maybe_expiration is not None:  # if we found a value
-                experts[i] = RemoteExpert(uid=uid, endpoint=maybe_endpoint)
-
-        future.set_result(experts)
+        response = await node.get_many(uids, expiration_time, num_workers=num_workers)
+        future.set_result([RemoteExpert(uid, maybe_endpoint) if maybe_expiration_time else None
+                           for uid, (maybe_endpoint, maybe_expiration_time) in response.items()])
 
 
     def declare_experts(self, uids: List[str], endpoint: Endpoint, wait=True, timeout=None) -> Optional[List[bool]]:
         """
@@ -136,69 +163,70 @@ class DHT(mp.Process):
         if wait:
             return future.result(timeout)

-    def _declare_experts(self, uids: List[str], endpoint: Endpoint, future: Optional[MPFuture]):
-        assert self.node is not None, "This method should only be accessed from inside .run method"
+    async def _declare_experts(self, node: DHTNode, uids: List[str], endpoint: Endpoint, future: Optional[MPFuture]):
         num_workers = len(uids) if self.max_workers is None else min(len(uids), self.max_workers)
-        loop = asyncio.get_event_loop()
-        expiration_time = get_dht_time() + self.EXPIRATION
-        unique_prefixes = set()
+        expiration_time = get_dht_time() + self.expiration

-        keys, values = [], []
+        data_to_store = {}
         for uid in uids:
-            uid_parts = uid.split(self.UID_DELIMETER)
-            keys.append(self.make_key('expert', uid))
-            values.append(endpoint)
-            unique_prefixes.update([self.UID_DELIMETER.join(uid_parts[:i + 1]) for i in range(len(uid_parts))])
-
-        for prefix in unique_prefixes:
-            keys.append(self.make_key('prefix', prefix))
-            values.append(True)
-
-        store_ok = asyncio.run_coroutine_threadsafe(
-            self.node.store_many(keys, values, expiration_time, num_workers=num_workers), loop
-        ).result()
+            uid_parts = uid.split(self.UID_DELIMITER)
+            for i in range(len(uid_parts)):
+                uid_prefix_i = self.UID_DELIMITER.join(uid_parts[:i + 1])
+                data_to_store[uid_prefix_i] = endpoint
+
+        store_keys, store_values = zip(*data_to_store.items())
+        store_ok = await node.store_many(store_keys, store_values, expiration_time, num_workers=num_workers)
         if future is not None:
-            future.set_result([store_ok[key] for key in keys])
+            future.set_result([store_ok[key] for key in data_to_store.keys()])

-    def first_k_active(self, prefixes: List[str], k: int, max_prefetch=None):
+    def first_k_active(self, uid_prefixes: List[str], k: int, max_prefetch: int = 1, chunk_size: Optional[int] = None):
         """
         Find k prefixes with active experts; may return less if there aren't enough; used for DMoE beam search

-        :param prefixes: a list of uid prefixes ordered from highest to lowest priority
+        :param uid_prefixes: a list of uid prefixes ordered from highest to lowest priority
         :param k: return at most *this many* active prefixes
-        :param max_prefetch: pre-dispatch up to *this many* asynchronous expert requests, defaults to pre-dispatch = k
+        :param max_prefetch: pre-dispatch up to *this many* tasks (each for chunk_size experts)
+        :param chunk_size: dispatch this many requests in one task
         :returns: a list of at most :k: prefixes that have at least one active expert each;
         """
-        assert isinstance(prefixes, (list, tuple)), "please provide a list/tuple of prefixes as the first argument"
+        assert not isinstance(uid_prefixes, str), "please provide a list/tuple of prefixes as the first argument"
         future, _future = MPFuture.make_pair()
         self.pipe.send(('_first_k_active', [],
-                        dict(prefixes=prefixes, k=k, max_prefetch=max_prefetch or k, future=_future)))
+                        dict(uid_prefixes=uid_prefixes, k=k, max_prefetch=max_prefetch,
+                             chunk_size=chunk_size or k, future=_future)))
         return future.result()

-    def _first_k_active(self, prefixes: List[str], k: int, max_prefetch: Optional[int], future: MPFuture):
-        assert self.node is not None, "This method should only be accessed from inside .run method"
-        max_prefetch = max_prefetch or len(prefixes)
-        loop = asyncio.get_event_loop()
-        lookup_prefetch = [asyncio.run_coroutine_threadsafe(self.node.get(self.make_key('prefix', prefix)), loop)
-                           for prefix in prefixes[:max_prefetch]]
+    async def _first_k_active(
+            self, node: DHTNode, uid_prefixes: List[str], k: int, max_prefetch: int, chunk_size: int, future: MPFuture):
+        num_workers_per_chunk = min(chunk_size, self.max_workers or chunk_size)
+        total_chunks = (len(uid_prefixes) - 1) // chunk_size + 1
         active_prefixes = []

-        for i, prefix in enumerate(prefixes):
-            _, maybe_expiration = lookup_prefetch[i].result()
-
-            if maybe_expiration is not None:
-                active_prefixes.append(prefix)
-                if len(active_prefixes) >= k:
-                    future.set_result(active_prefixes)
-                    for task in lookup_prefetch[i:]:
-                        task.cancel()
-                    return
-
-            # pre-dispatch the next request in line
-            if len(lookup_prefetch) < len(prefixes):
-                lookup_prefetch.append(
-                    asyncio.run_coroutine_threadsafe(
-                        self.node.get(self.make_key('prefix', prefixes[len(lookup_prefetch)])), loop))
-
-        # could not find enough active prefixes; return what we can
+        pending_tasks = deque(
+            asyncio.create_task(node.get_many(uid_prefixes[chunk_i * chunk_size: (chunk_i + 1) * chunk_size],
+                                              num_workers=num_workers_per_chunk))
+            for chunk_i in range(min(max_prefetch + 1, total_chunks))
+        )  # pre-dispatch first task and up to max_prefetch additional tasks
+
+        for chunk_i in range(total_chunks):
+            # parse task results in chronological order, launch additional tasks on demand
+            response = await pending_tasks.popleft()
+            for uid_prefix in uid_prefixes[chunk_i * chunk_size: (chunk_i + 1) * chunk_size]:
+                if response[uid_prefix][1] is not None:  # found active peer
+                    active_prefixes.append(uid_prefix)
+                    # if we found enough active experts, finish immediately
+                    if len(active_prefixes) >= k:
+                        break
+            if len(active_prefixes) >= k:
+                for task in pending_tasks:
+                    task.cancel()
+                break
+
+            pre_dispatch_chunk_i = chunk_i + len(pending_tasks) + 1
+            if pre_dispatch_chunk_i < total_chunks:
+                pending_tasks.append(asyncio.create_task(node.get_many(
+                    uid_prefixes[pre_dispatch_chunk_i * chunk_size: (pre_dispatch_chunk_i + 1) * chunk_size],
+                    num_workers=num_workers_per_chunk)))
+
+        # return k active prefixes or as many as we could find
         future.set_result(active_prefixes)

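The new docstring describes how declare_experts stores an expert uid together with all of its prefixes. The helper below is an illustrative sketch of that scheme, not code from this commit; it mirrors the loop inside _declare_experts and the formula in the UID_DELIMITER comment:

    UID_DELIMITER = '.'

    def expert_uid_prefixes(uid: str):
        # the uid itself plus every prefix that declare_experts would store in the DHT
        parts = uid.split(UID_DELIMITER)
        return [UID_DELIMITER.join(parts[:i + 1]) for i in range(len(parts))]

    print(expert_uid_prefixes("ffn_expert.98.76.54.32.10"))
    # ['ffn_expert', 'ffn_expert.98', 'ffn_expert.98.76', 'ffn_expert.98.76.54',
    #  'ffn_expert.98.76.54.32', 'ffn_expert.98.76.54.32.10']

During beam search, first_k_active then queries such prefixes in priority order (in chunks of chunk_size) and stops as soon as k of them turn out to have at least one live expert.
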
+ 17 - 17
hivemind/dht/dht.proto

@@ -17,39 +17,39 @@ service DHT {
 message NodeInfo {
   // note: both node_id and port are optional: if specified, ask peer to add you to its routing table;
   // if either node_id or port is absent, simply request recipient info (for client-only mode)
-  bytes node_id = 1;                // sender's own node id serialized with DHTID.to_bytes()
-  int32 rpc_port = 2;               // port to which sender listens for DHT RPCs
+  bytes node_id = 1;                   // sender's own node id serialized with DHTID.to_bytes()
+  int32 rpc_port = 2;                  // port to which sender listens for DHT RPCs
 }

 message StoreRequest {
   // three lists of the same length representing dht keys, dht values and expiration
-  repeated bytes keys = 1;          // keys in the form of DHTID.generate(raw_key).to_bytes()
-  repeated bytes values = 2;        // binary-encoded value for i-th key
-  repeated double expiration = 3;   // expirations for i-th key (type = DHTExpiration)
-  repeated bool in_cache = 4;       // if in_cache[i], store i-th key in cache, else store normally
-  NodeInfo peer = 5;                // (optional) sender's own node info, same behavior as in DHT.rpc_ping
+  repeated bytes keys = 1;             // keys in the form of DHTID.generate(raw_key).to_bytes()
+  repeated bytes values = 2;           // binary-encoded value for i-th key
+  repeated double expiration_time = 3; // expirations for i-th key (type = DHTExpiration)
+  repeated bool in_cache = 4;          // if in_cache[i], store i-th key in cache, else store normally
+  NodeInfo peer = 5;                   // (optional) sender's own node info, same behavior as in DHT.rpc_ping
 }

 message StoreResponse {
-  repeated bool store_ok = 1;       // for every key, True means store accepted, False means store rejected/failed
-  NodeInfo peer = 2;                // respondent's node id, for you to update routing table
+  repeated bool store_ok = 1;          // for every key, True means store accepted, False means store rejected/failed
+  NodeInfo peer = 2;                   // respondent's node id, for you to update routing table
 }

 message FindRequest {
-  repeated bytes keys = 1;          // a list of DHTID search keys encoded as bytes
-  NodeInfo peer = 2;                // optional, same behavior as in DHT.ping
+  repeated bytes keys = 1;             // a list of DHTID search keys encoded as bytes
+  NodeInfo peer = 2;                   // optional, same behavior as in DHT.ping
 }

 message Peers {
   // two aligned arrays: DHTIDs and Endpoints, i-th endpoint corresponds to peer with i-th node id
-  repeated bytes node_ids = 1;       // DHTID serialized with node_id.to_bytes()
-  repeated string endpoints = 2;     // e.g. 123.123.123.123:1337 or [2a21:6с8:b192:2105]:8888
+  repeated bytes node_ids = 1;         // DHTID serialized with node_id.to_bytes()
+  repeated string endpoints = 2;       // e.g. 123.123.123.123:1337 or [2a21:6с8:b192:2105]:8888
 }

 message FindResponse {
-  repeated bytes values = 1;        // value for i-th key, b'' means not found locally
-  repeated double expiration = 2;   // expiration time for i-th value, only valid value is found
-  repeated Peers nearest = 3;       // peers ordered from nearest to farthest based on distance to i-th key
-  NodeInfo peer = 4;                // respondent's node id, for you to update routing table
+  repeated bytes values = 1;           // value for i-th key, b'' means not found locally
+  repeated double expiration_time = 2; // expiration time for i-th value, only valid value is found
+  repeated Peers nearest = 3;          // peers ordered from nearest to farthest based on distance to i-th key
+  NodeInfo peer = 4;                   // respondent's node id, for you to update routing table
 }


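With the rename, all four repeated fields of StoreRequest stay aligned by index, one entry per key. A hedged sketch of building such a request from Python (the generated module is referenced as dht_pb2 in protocol.py; the import path below and the placeholder key/value bytes are assumptions, not part of the commit):

    from hivemind.proto import dht_pb2  # assumed import path for the generated module

    request = dht_pb2.StoreRequest(
        keys=[b'\x01' * 20, b'\x02' * 20],             # DHTID.generate(key).to_bytes() per key (placeholders here)
        values=[b'host1:1337', b'host2:1337'],         # binary-encoded value for the i-th key
        expiration_time=[1700000000.0, 1700000060.0],  # DHTExpiration for the i-th key (field was "expiration")
        in_cache=[False, True],                        # ask the recipient to keep the second key in its cache
    )
    assert len(request.keys) == len(request.values) == len(request.expiration_time) == len(request.in_cache)
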
+ 40 - 33
hivemind/dht/node.py

@@ -28,7 +28,7 @@ class DHTNode:
     Compared to Kademlia RPC protocol, hivemind DHT has 3 RPCs:

     * ping - request peer's identifier and update routing table (same as Kademlia PING RPC)
-    * store - send several (key, value, expiration) pairs to the same peer (like Kademlia STORE, but in bulk)
+    * store - send several (key, value, expiration_time) pairs to the same peer (like Kademlia STORE, but in bulk)
     * find - request one or several keys, get values & expiration (if peer finds it locally) and :bucket_size: of
         nearest peers from recipient's routing table (ordered nearest-to-farthest, not including recipient itself)
         This RPC is a mixture between Kademlia FIND_NODE and FIND_VALUE with multiple keys per call.
@@ -37,10 +37,10 @@ class DHTNode:
 
 
     - when asked to get(key), a node must find and return a value with highest expiration time that it found across DHT
       IF that time has not come yet. if expiration time is smaller than current get_dht_time(), node may return None;
-    - when requested to store(key: value, expiration), a node must store (key => value) at until expiration time
+    - when requested to store(key: value, expiration_time), a node must store (key => value) at until expiration time
       or until DHTNode gets the same key with greater expiration time. If a node is asked to store a key but it already
       has the same key with newer expiration, the older key will not be stored. Return True if stored, False if refused;
-    - when requested to store(key: value, expiration, in_cache=True), stores (key => value) in a separate "cache".
+    - when requested to store(key: value, expiration_time, in_cache=True), stores (key => value) in a separate "cache".
       Cache operates same as regular storage, but it has a limited size and evicts least recently used nodes when full;

     """
@@ -191,15 +191,15 @@ class DHTNode:
         store_ok = await self.store_many([key], [value], [expiration_time], **kwargs)
         return store_ok[key]

-    async def store_many(
-            self, keys: List[DHTKey], values: List[DHTValue], expiration: Union[DHTExpiration, List[DHTExpiration]],
-            exclude_self: bool = False, await_all_replicas=True, **kwargs) -> Dict[DHTKey, bool]:
+    async def store_many(self, keys: List[DHTKey], values: List[DHTValue],
+                         expiration_time: Union[DHTExpiration, List[DHTExpiration]],
+                         exclude_self: bool = False, await_all_replicas=True, **kwargs) -> Dict[DHTKey, bool]:
         """
-        Traverse DHT to find up to best nodes to store multiple (key, value, expiration) pairs.
+        Traverse DHT to find up to best nodes to store multiple (key, value, expiration_time) pairs.

         :param keys: arbitrary serializable keys associated with each value
         :param values: serializable "payload" for each key
-        :param expiration: either one expiration time for all keys or individual expiration times (see class doc)
+        :param expiration_time: either one expiration time for all keys or individual expiration times (see class doc)
         :param kwargs: any additional parameters passed to traverse_dht function (e.g. num workers)
         :param exclude_self: if True, never store value locally even if you are one of the nearest nodes
         :note: if exclude_self is True and self.cache_locally == True, value will still be __cached__ locally
@@ -207,13 +207,14 @@ class DHTNode:
             if True, the function will wait for num_replicas successful stores or running out of beam_size nodes
         :returns: for each key: True if store succeeds, False if it fails (due to no response or newer value)
         """
-        expiration = [expiration] * len(keys) if isinstance(expiration, DHTExpiration) else expiration
-        assert len(keys) == len(values) == len(expiration), "Please provide equal number of keys, values and expiration"
+        if isinstance(expiration_time, DHTExpiration):
+            expiration_time = [expiration_time] * len(keys)
+        assert len(keys) == len(values) == len(expiration_time), "Number of keys, values and expiration doesn't match."

         key_ids = list(map(DHTID.generate, keys))
         id_to_original_key = dict(zip(key_ids, keys))
         binary_values_by_key_id = {key_id: self.serializer.dumps(value) for key_id, value in zip(key_ids, values)}
-        expiration_by_key_id = {key_id: expiration_time for key_id, expiration_time in zip(key_ids, expiration)}
+        expiration_by_key_id = {key_id: expiration_time for key_id, expiration_time in zip(key_ids, expiration_time)}
         unfinished_key_ids = set(key_ids)  # we use this set to ensure that each store request is finished

         store_ok = {key: False for key in keys}  # outputs, updated during search
@@ -272,12 +273,16 @@ class DHTNode:
 
 
             store_finished_events[id_to_original_key[key_id]].set()

-        asyncio.create_task(self.find_nearest_nodes(
+        store_task = asyncio.create_task(self.find_nearest_nodes(
             queries=set(key_ids), k_nearest=self.num_replicas, node_to_endpoint=node_to_endpoint,
             found_callback=on_found, exclude_self=exclude_self, **kwargs))
-        await asyncio.wait([evt.wait() for evt in store_finished_events.values()])  # await one (or all) store accepts
-        assert len(unfinished_key_ids) == 0, "Internal error: traverse_dht didn't finish search"
-        return store_ok
+        try:
+            await asyncio.wait([evt.wait() for evt in store_finished_events.values()])  # wait for items to be stored
+            assert len(unfinished_key_ids) == 0, "Internal error: traverse_dht didn't finish search"
+            return store_ok
+        except asyncio.CancelledError as e:
+            store_task.cancel()
+            raise e

     async def get(self, key: DHTKey, latest=False, **kwargs) -> Tuple[Optional[DHTValue], Optional[DHTExpiration]]:
         """
@@ -316,17 +321,17 @@ class DHTNode:
         unfinished_key_ids = set(key_ids)  # track key ids for which the search is not terminated
         node_to_endpoint: Dict[DHTID, Endpoint] = dict()  # global routing table for all queries

-        SearchResult = namedtuple("SearchResult", ["binary_value", "expiration", "source_node_id"])
+        SearchResult = namedtuple("SearchResult", ["binary_value", "expiration_time", "source_node_id"])
         latest_results = {key_id: SearchResult(b'', -float('inf'), None) for key_id in key_ids}

         # stage 1: value can be stored in our local cache
         for key_id in key_ids:
-            maybe_value, maybe_expiration = self.protocol.storage.get(key_id)
-            if maybe_expiration is None:
-                maybe_value, maybe_expiration = self.protocol.cache.get(key_id)
-            if maybe_expiration is not None and maybe_expiration > latest_results[key_id].expiration:
-                latest_results[key_id] = SearchResult(maybe_value, maybe_expiration, self.node_id)
-                if maybe_expiration >= sufficient_expiration_time:
+            maybe_value, maybe_expiration_time = self.protocol.storage.get(key_id)
+            if maybe_expiration_time is None:
+                maybe_value, maybe_expiration_time = self.protocol.cache.get(key_id)
+            if maybe_expiration_time is not None and maybe_expiration_time > latest_results[key_id].expiration_time:
+                latest_results[key_id] = SearchResult(maybe_value, maybe_expiration_time, self.node_id)
+                if maybe_expiration_time >= sufficient_expiration_time:
                     unfinished_key_ids.remove(key_id)

         # stage 2: traverse the DHT for any unfinished keys
@@ -341,11 +346,11 @@ class DHTNode:
                 return {query: ([], False) for query in queries}

             output: Dict[DHTID, Tuple[List[DHTID], bool]] = {}
-            for key_id, (maybe_value, maybe_expiration, peers) in response.items():
+            for key_id, (maybe_value, maybe_expiration_time, peers) in response.items():
                 node_to_endpoint.update(peers)
-                if maybe_expiration is not None and maybe_expiration > latest_results[key_id].expiration:
-                    latest_results[key_id] = SearchResult(maybe_value, maybe_expiration, peer)
-                should_interrupt = (latest_results[key_id].expiration >= sufficient_expiration_time)
+                if maybe_expiration_time is not None and maybe_expiration_time > latest_results[key_id].expiration_time:
+                    latest_results[key_id] = SearchResult(maybe_value, maybe_expiration_time, peer)
+                should_interrupt = (latest_results[key_id].expiration_time >= sufficient_expiration_time)
                 output[key_id] = list(peers.keys()), should_interrupt
             return output

@@ -356,10 +361,10 @@ class DHTNode:
 
 
         # stage 3: cache any new results depending on caching parameters
         for key_id, nearest_nodes in nearest_nodes_per_query.items():
-            latest_value_bytes, latest_expiration, latest_node_id = latest_results[key_id]
-            should_cache = latest_expiration >= sufficient_expiration_time  # if we found a newer value, cache it
+            latest_value_bytes, latest_expiration_time, latest_node_id = latest_results[key_id]
+            should_cache = latest_expiration_time >= sufficient_expiration_time  # if we found a newer value, cache it
             if should_cache and self.cache_locally:
-                self.protocol.cache.store(key_id, latest_value_bytes, latest_expiration)
+                self.protocol.cache.store(key_id, latest_value_bytes, latest_expiration_time)

             if should_cache and self.cache_nearest:
                 num_cached_nodes = 0
@@ -367,16 +372,18 @@ class DHTNode:
                     if node_id == latest_node_id:
                         continue
                     asyncio.create_task(self.protocol.call_store(
-                        node_to_endpoint[node_id], [key_id], [latest_value_bytes], [latest_expiration], in_cache=True))
+                        node_to_endpoint[node_id], [key_id], [latest_value_bytes], [latest_expiration_time],
+                        in_cache=True))
                     num_cached_nodes += 1
                     if num_cached_nodes >= self.cache_nearest:
                         break

         # stage 4: deserialize data and assemble function output
         find_result: Dict[DHTKey, Tuple[Optional[DHTValue], Optional[DHTExpiration]]] = {}
-        for key_id, (latest_value_bytes, latest_expiration, _) in latest_results.items():
-            if latest_expiration != -float('inf'):
-                find_result[id_to_original_key[key_id]] = self.serializer.loads(latest_value_bytes), latest_expiration
+        for key_id, (latest_value_bytes, latest_expiration_time, _) in latest_results.items():
+            if latest_expiration_time != -float('inf'):
+                latest_value = self.serializer.loads(latest_value_bytes)
+                find_result[id_to_original_key[key_id]] = (latest_value, latest_expiration_time)
             else:
                 find_result[id_to_original_key[key_id]] = None, None
         return find_result

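store_many above (and traverse_dht below) now use the same cancellation pattern: keep a handle to the background task and cancel it when the outer coroutine is cancelled, then re-raise. A minimal self-contained sketch of the pattern in plain asyncio (illustrative, not hivemind code):

    import asyncio

    async def background_search():
        await asyncio.sleep(3600)  # stand-in for find_nearest_nodes / traverse_dht workers

    async def store_like_operation():
        helper = asyncio.create_task(background_search())
        try:
            await asyncio.sleep(1)  # stand-in for awaiting store_finished_events
            return "stored"
        except asyncio.CancelledError:
            helper.cancel()         # propagate cancellation to the helper task, then re-raise
            raise

    async def main():
        op = asyncio.create_task(store_like_operation())
        await asyncio.sleep(0.1)
        op.cancel()                 # cancelling the outer task now also cancels its helper
        try:
            await op
        except asyncio.CancelledError:
            print("operation and helper cancelled")

    asyncio.run(main())
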
+ 26 - 24
hivemind/dht/protocol.py

@@ -113,15 +113,15 @@ class DHTProtocol(dht_grpc.DHTServicer):
         return self.node_info

     async def call_store(self, peer: Endpoint, keys: Sequence[DHTID], values: Sequence[BinaryDHTValue],
-                         expirations: Union[DHTExpiration, Sequence[DHTExpiration]],
+                         expiration_time: Union[DHTExpiration, Sequence[DHTExpiration]],
                          in_cache: Optional[Union[bool, Sequence[bool]]] = None) -> Sequence[bool]:
         """
-        Ask a recipient to store several (key, value : expiration) items or update their older value
+        Ask a recipient to store several (key, value : expiration_time) items or update their older value

         :param peer: request this peer to store the data
         :param keys: a list of N keys digested by DHTID.generate(source=some_dict_key)
         :param values: a list of N serialized values (bytes) for each respective key
-        :param expirations: a list of N expiration timestamps for each respective key-value pair (see get_dht_time())
+        :param expiration_time: a list of N expiration timestamps for each respective key-value pair (see get_dht_time())
         :param in_cache: a list of booleans, True = store i-th key in cache, value = store i-th key locally
         :note: the difference between storing normally and in cache is that normal storage is guaranteed to be stored
          until expiration time (best-effort), whereas cached storage can be evicted early due to limited cache size
@@ -129,13 +129,14 @@ class DHTProtocol(dht_grpc.DHTServicer):
         :return: list of [True / False] True = stored, False = failed (found newer value or no response)
          if peer did not respond (e.g. due to timeout or congestion), returns None
         """
+        if isinstance(expiration_time, DHTExpiration):
+            expiration_time = [expiration_time] * len(keys)
         in_cache = in_cache if in_cache is not None else [False] * len(keys)  # default value (None)
         in_cache = [in_cache] * len(keys) if isinstance(in_cache, bool) else in_cache  # single bool
-        expirations = [expirations] * len(keys) if isinstance(expirations, DHTExpiration) else expirations
-        keys, values, expirations, in_cache = map(list, [keys, values, expirations, in_cache])
-        assert len(keys) == len(values) == len(expirations) == len(in_cache), "Data is not aligned"
+        keys, values, expiration_time, in_cache = map(list, [keys, values, expiration_time, in_cache])
+        assert len(keys) == len(values) == len(expiration_time) == len(in_cache), "Data is not aligned"
         store_request = dht_pb2.StoreRequest(keys=list(map(DHTID.to_bytes, keys)), values=values,
-                                             expiration=expirations, in_cache=in_cache, peer=self.node_info)
+                                             expiration_time=expiration_time, in_cache=in_cache, peer=self.node_info)
         try:
             async with self.rpc_semaphore:
                 response = await self._get(peer).rpc_store(store_request, timeout=self.wait_timeout)
@@ -152,10 +153,10 @@ class DHTProtocol(dht_grpc.DHTServicer):
         """ Some node wants us to store this (key, value) pair """
         """ Some node wants us to store this (key, value) pair """
         if request.peer:  # if requested, add peer to the routing table
         if request.peer:  # if requested, add peer to the routing table
             asyncio.create_task(self.rpc_ping(request.peer, context))
             asyncio.create_task(self.rpc_ping(request.peer, context))
-        assert len(request.keys) == len(request.values) == len(request.expiration) == len(request.in_cache)
+        assert len(request.keys) == len(request.values) == len(request.expiration_time) == len(request.in_cache)
         response = dht_pb2.StoreResponse(store_ok=[], peer=self.node_info)
         response = dht_pb2.StoreResponse(store_ok=[], peer=self.node_info)
         for key_bytes, value_bytes, expiration_time, in_cache in zip(
         for key_bytes, value_bytes, expiration_time, in_cache in zip(
-                request.keys, request.values, request.expiration, request.in_cache):
+                request.keys, request.values, request.expiration_time, request.in_cache):
             local_memory = self.cache if in_cache else self.storage
             local_memory = self.cache if in_cache else self.storage
             response.store_ok.append(local_memory.store(DHTID.from_bytes(key_bytes), value_bytes, expiration_time))
             response.store_ok.append(local_memory.store(DHTID.from_bytes(key_bytes), value_bytes, expiration_time))
         return response
         return response
@@ -180,15 +181,16 @@ class DHTProtocol(dht_grpc.DHTServicer):
             if response.peer and response.peer.node_id:
                 peer_id = DHTID.from_bytes(response.peer.node_id)
                 asyncio.create_task(self.update_routing_table(peer_id, peer, responded=True))
-            assert len(response.values) == len(response.expiration) == len(response.nearest) == len(keys), \
-                "DHTProtocol: response is not aligned with keys"
+            assert len(response.values) == len(response.expiration_time) == len(response.nearest) == len(keys), \
+                "DHTProtocol: response is not aligned with keys and/or expiration times"

             output = {}  # unpack data without special NOT_FOUND_* values
-            for key, value, expiration, nearest in zip(keys, response.values, response.expiration, response.nearest):
+            for key, value, expiration_time, nearest in zip(
+                    keys, response.values, response.expiration_time, response.nearest):
                 value = value if value != _NOT_FOUND_VALUE else None
-                expiration = expiration if expiration != _NOT_FOUND_EXPIRATION else None
+                expiration_time = expiration_time if expiration_time != _NOT_FOUND_EXPIRATION else None
                 nearest = dict(zip(map(DHTID.from_bytes, nearest.node_ids), nearest.endpoints))
-                output[key] = (value, expiration, nearest)
+                output[key] = (value, expiration_time, nearest)
             return output
         except grpc.experimental.aio.AioRpcError as error:
             logger.warning(f"DHTProtocol failed to find at {peer}: {error.code()}")
@@ -202,12 +204,12 @@ class DHTProtocol(dht_grpc.DHTServicer):
         if request.peer:  # if requested, add peer to the routing table
             asyncio.create_task(self.rpc_ping(request.peer, context))

-        response = dht_pb2.FindResponse(values=[], expiration=[], nearest=[], peer=self.node_info)
+        response = dht_pb2.FindResponse(values=[], expiration_time=[], nearest=[], peer=self.node_info)
         for key_id in map(DHTID.from_bytes, request.keys):
-            maybe_value, maybe_expiration = self.storage.get(key_id)
-            cached_value, cached_expiration = self.cache.get(key_id)
-            if (cached_expiration or -float('inf')) > (maybe_expiration or -float('inf')):
-                maybe_value, maybe_expiration = cached_value, cached_expiration
+            maybe_value, maybe_expiration_time = self.storage.get(key_id)
+            cached_value, cached_expiration_time = self.cache.get(key_id)
+            if (cached_expiration_time or -float('inf')) > (maybe_expiration_time or -float('inf')):
+                maybe_value, maybe_expiration_time = cached_value, cached_expiration_time

             nearest_neighbors = self.routing_table.get_nearest_neighbors(
                 key_id, k=self.bucket_size, exclude=DHTID.from_bytes(request.peer.node_id))
@@ -217,7 +219,7 @@ class DHTProtocol(dht_grpc.DHTServicer):
                 peer_ids, endpoints = [], []

             response.values.append(maybe_value if maybe_value is not None else _NOT_FOUND_VALUE)
-            response.expiration.append(maybe_expiration if maybe_expiration is not None else _NOT_FOUND_EXPIRATION)
+            response.expiration_time.append(maybe_expiration_time if maybe_expiration_time else _NOT_FOUND_EXPIRATION)
             response.nearest.append(dht_pb2.Peers(node_ids=list(map(DHTID.to_bytes, peer_ids)), endpoints=endpoints))
         return response

@@ -235,7 +237,7 @@ class DHTProtocol(dht_grpc.DHTServicer):
             if node_id not in self.routing_table:
                 # we just met a new node, maybe we know some values that it *should* store
                 data_to_send: List[Tuple[DHTID, BinaryDHTValue, DHTExpiration]] = []
-                for key, value, expiration in list(self.storage.items()):
+                for key, value, expiration_time in list(self.storage.items()):
                     neighbors = self.routing_table.get_nearest_neighbors(key, self.num_replicas, exclude=self.node_id)
                     if neighbors:
                         nearest_distance = neighbors[0][0].xor_distance(key)
@@ -243,7 +245,7 @@ class DHTProtocol(dht_grpc.DHTServicer):
                         new_node_should_store = node_id.xor_distance(key) < farthest_distance
                         this_node_is_responsible = self.node_id.xor_distance(key) < nearest_distance
                     if not neighbors or (new_node_should_store and this_node_is_responsible):
-                        data_to_send.append((key, value, expiration))
+                        data_to_send.append((key, value, expiration_time))
                 if data_to_send:
                     asyncio.create_task(self.call_store(peer_endpoint, *zip(*data_to_send), in_cache=False))

@@ -262,7 +264,7 @@ _NOT_FOUND_VALUE, _NOT_FOUND_EXPIRATION = b'', -float('inf')  # internal values
 
 
 
 
 class LocalStorage:
 class LocalStorage:
-    """ Local dictionary that maintains up to :maxsize: tuples of (key, value, expiration) """
+    """ Local dictionary that maintains up to :maxsize: tuples of (key, value, expiration_time) """

     def __init__(self, maxsize: Optional[int] = None):
         self.cache_size = maxsize or float("inf")
@@ -306,4 +308,4 @@ class LocalStorage:
     def items(self) -> Iterator[Tuple[DHTID, BinaryDHTValue, DHTExpiration]]:
         """ Iterate over (key, value, expiration_time) tuples stored in this storage """
         self.remove_outdated()
-        return ((key, value, expiration) for key, (value, expiration) in self.data.items())
+        return ((key, value, expiration_time) for key, (value, expiration_time) in self.data.items())
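
Note on the hunk above: after the rename, LocalStorage consistently exposes (key, value, expiration_time) triples. For readers who have not seen the class, here is a rough self-contained sketch of the idea behind such a store; the name ExpiringStorage, its store method and the use of time.monotonic() are illustrative assumptions rather than the library's actual API (the real LocalStorage lives in hivemind/dht/protocol.py and uses DHT-wide timestamps).

import time
from typing import Any, Dict, Iterator, Optional, Tuple

class ExpiringStorage:
    """ Toy bounded store of (key, value, expiration_time) triples -- illustration only """

    def __init__(self, maxsize: Optional[int] = None):
        self.maxsize = maxsize or float('inf')
        self.data: Dict[Any, Tuple[Any, float]] = {}

    def remove_outdated(self) -> None:
        now = time.monotonic()
        for key in [k for k, (_, expiration_time) in self.data.items() if expiration_time < now]:
            del self.data[key]

    def store(self, key: Any, value: Any, expiration_time: float) -> bool:
        self.remove_outdated()
        if expiration_time < time.monotonic():
            return False  # refuse to store an already-expired entry
        self.data[key] = (value, expiration_time)
        if len(self.data) > self.maxsize:
            # evict whichever entry expires soonest, mimicking a size-capped cache
            del self.data[min(self.data, key=lambda k: self.data[k][1])]
        return True

    def items(self) -> Iterator[Tuple[Any, Any, float]]:
        self.remove_outdated()
        return ((key, value, expiration_time) for key, (value, expiration_time) in self.data.items())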

+ 19 - 13
hivemind/dht/traverse.py

@@ -215,16 +215,22 @@ async def traverse_dht(
             active_workers.subtract(queries_to_call)
             heap_updated_event.set()

-    # spawn all workers and wait for them to terminate; workers terminate after exhausting unfinished_queries
-    await asyncio.wait([asyncio.create_task(worker()) for _ in range(num_workers)],
-                       return_when=asyncio.FIRST_COMPLETED)  # first worker finishes when the search is over
-    assert len(unfinished_queries) == 0 and search_finished_event.is_set()
-
-    if await_all_tasks:
-        await asyncio.gather(*pending_tasks)
-
-    nearest_neighbors_per_query = {
-        query: [peer for _, peer in heapq.nlargest(beam_size, nearest_nodes[query])]
-        for query in queries
-    }
-    return nearest_neighbors_per_query, visited_nodes
+    workers = [asyncio.create_task(worker()) for _ in range(num_workers)]
+    try:
+        # spawn all workers and wait for them to terminate; workers terminate after exhausting unfinished_queries
+        await asyncio.wait(workers, return_when=asyncio.FIRST_COMPLETED)
+        assert len(unfinished_queries) == 0 and search_finished_event.is_set()
+
+        if await_all_tasks:
+            await asyncio.gather(*pending_tasks)
+
+        nearest_neighbors_per_query = {
+            query: [peer for _, peer in heapq.nlargest(beam_size, nearest_nodes[query])]
+            for query in queries
+        }
+        return nearest_neighbors_per_query, visited_nodes
+
+    except asyncio.CancelledError as e:
+        for worker in workers:
+            worker.cancel()
+        raise e
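
The hunk above makes traverse_dht cancellable: if the caller's task is cancelled mid-search, every spawned worker is cancelled too instead of being orphaned. A minimal self-contained sketch of the same pattern (run_workers_cancellable, make_worker and demo are made-up names for illustration):

import asyncio

async def run_workers_cancellable(make_worker, num_workers: int):
    workers = [asyncio.create_task(make_worker()) for _ in range(num_workers)]
    try:
        # the first worker to finish signals that the overall search is over
        await asyncio.wait(workers, return_when=asyncio.FIRST_COMPLETED)
    except asyncio.CancelledError:
        for worker in workers:
            worker.cancel()  # propagate cancellation instead of leaking tasks
        raise

async def demo():
    async def make_worker():
        await asyncio.sleep(0.5)

    search = asyncio.create_task(run_workers_cancellable(make_worker, num_workers=4))
    await asyncio.sleep(0.05)  # let the workers start
    search.cancel()            # cancelling the outer task now cancels every worker as well
    try:
        await search
    except asyncio.CancelledError:
        pass

if __name__ == '__main__':
    asyncio.run(demo())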

+ 2 - 4
hivemind/server/connection_handler.py

@@ -59,13 +59,11 @@ class ConnectionHandler(mp.Process):
     async def forward(self, request: runtime_pb2.ExpertRequest, context: grpc.ServicerContext):
         inputs = [deserialize_torch_tensor(tensor) for tensor in request.tensors]
         future = self.experts[request.uid].forward_pool.submit_task(*inputs)
-        response = await future.async_result()
-        serialized_response = [serialize_torch_tensor(tensor) for tensor in response]
+        serialized_response = [serialize_torch_tensor(tensor) for tensor in await future]
         return runtime_pb2.ExpertResponse(tensors=serialized_response)

     async def backward(self, request: runtime_pb2.ExpertRequest, context: grpc.ServicerContext):
         inputs_and_grad_outputs = [deserialize_torch_tensor(tensor) for tensor in request.tensors]
         future = self.experts[request.uid].backward_pool.submit_task(*inputs_and_grad_outputs)
-        response = await future.async_result()
-        serialized_response = [serialize_torch_tensor(tensor) for tensor in response]
+        serialized_response = [serialize_torch_tensor(tensor) for tensor in await future]
         return runtime_pb2.ExpertResponse(tensors=serialized_response)

+ 116 - 99
hivemind/utils/mpfuture.py

@@ -1,21 +1,25 @@
+from __future__ import annotations
+import time
 import multiprocessing as mp
 import multiprocessing.connection
-from concurrent.futures import Future, CancelledError
-from warnings import warn
+import concurrent.futures._base as base
+
 import asyncio
+from functools import lru_cache
+from typing import Optional
+
+from hivemind.utils.threading import run_in_background


-class MPFuture(Future):
-    """ Multiprocessing version of concurrent.futures.Future, interacts between two processes via Pipe """
-    STATES = 'pending', 'running', 'cancelled', 'finished', 'exception'
-    STATE_PENDING, STATE_RUNNING, STATE_CANCELLED, STATE_FINISHED, STATE_EXCEPTION = STATES
+class MPFuture(base.Future):
+    """ Multiprocessing version of concurrent.futures.Future. Can also be awaited like asyncio.Future """
+
+    TERMINAL_STATES = {base.FINISHED, base.CANCELLED, base.CANCELLED_AND_NOTIFIED}

     def __init__(self, connection: mp.connection.Connection):
         """ manually create MPFuture. Please use MPFuture.make_pair instead """
+        self._state, self._result, self._exception = base.PENDING, None, None
         self.connection = connection
-        self.state = self.STATE_PENDING
-        self._result = None
-        self._exception = None

     @classmethod
     def make_pair(cls):
@@ -23,125 +27,138 @@ class MPFuture(Future):
         connection1, connection2 = mp.Pipe()
         return cls(connection1), cls(connection2)

-    def poll_and_recv(self, timeout):
-        available = self.connection.poll(timeout)
-        if not available:
-            raise TimeoutError
-        try:
-            status, payload = self.connection.recv()
-            self.connection.close()
-        except BrokenPipeError as e:
-            status, payload = self.STATE_EXCEPTION, e
-        return status, payload
-
-    def _recv(self, timeout):
-
-        if self.state in (self.STATE_PENDING, self.STATE_RUNNING):
-            status, payload = self.poll_and_recv(timeout)
-
-            assert status in self.STATES
-            self.state = status
-
-            if status == self.STATE_FINISHED:
-                self._result = payload
-            elif status == self.STATE_EXCEPTION:
-                self._exception = payload
-            elif status in (self.STATE_RUNNING, self.STATE_CANCELLED):
-                pass  # only update self.state
-            else:
-                raise ValueError("Result status should not be self.STATE_PENDING")
-
-    def set_result(self, result):
+    def _send_updates(self):
+        """ Send updates to a paired MPFuture """
         try:
-            self.state, self._result = self.STATE_FINISHED, result
-            self.connection.send((self.STATE_FINISHED, result))
-            self.connection.close()
+            self.connection.send((self._state, self._result, self._exception))
+            if self._state in self.TERMINAL_STATES:
+                self._shutdown_trigger.set_result(True)
+                self.connection.close()
             return True
         except BrokenPipeError:
             return False

-    def set_exception(self, exception: BaseException):
+    def _recv_updates(self, timeout: Optional[float]):
+        """ Await updates from a paired MPFuture """
         try:
-            self.state, self._exception = self.STATE_EXCEPTION, exception
-            self.connection.send((self.STATE_EXCEPTION, exception))
-            self.connection.close()
-            return True
-        except BrokenPipeError:
-            return False
+            future = base.wait([run_in_background(self.connection.poll, timeout), self._shutdown_trigger],
+                               return_when=base.FIRST_COMPLETED)[0].pop()
+            if future is self._shutdown_trigger:
+                raise BrokenPipeError()
+            if not future.result():
+                raise TimeoutError()
+            self._state, result, exception = self.connection.recv()
+            self._result = result if result is not None else self._result
+            self._exception = exception if exception is not None else self._exception
+            if self._state in self.TERMINAL_STATES:
+                self.connection.close()
+        except TimeoutError as e:
+            raise e
+        except (BrokenPipeError, OSError) as e:
+            if self._state in (base.PENDING, base.RUNNING):
+                self._state, self._exception = base.FINISHED, e
+
+    def _await_terminal_state(self, timeout: Optional[float]):
+        """ Await updates until future is either finished, cancelled or got an exception """
+        time_left = float('inf') if timeout is None else timeout
+        time_before = time.monotonic()
+        while self._state not in self.TERMINAL_STATES and time_left > 0:
+            self._recv_updates(time_left if timeout else None)
+            time_spent = time.monotonic() - time_before
+            time_left, time_before = time_left - time_spent, time_before + time_spent
+
+    def _sync_updates(self):
+        """ Apply queued updates from a paired MPFuture without waiting for new ones """
+        try:
+            self._recv_updates(timeout=0)
+        except TimeoutError:
+            pass
+
+    def set_result(self, result):
+        self._sync_updates()
+        if self._state in self.TERMINAL_STATES:
+            raise RuntimeError(f"Can't set_result to a future that is in {self._state}")
+        self._state, self._result = base.FINISHED, result
+        return self._send_updates()
+
+    def set_exception(self, exception: BaseException):
+        self._sync_updates()
+        if self._state in self.TERMINAL_STATES:
+            raise RuntimeError(f"Can't set_exception to a future that is in {self._state}")
+        self._state, self._exception = base.FINISHED, exception
+        self._send_updates()

     def set_running_or_notify_cancel(self):
-        return True
+        self._sync_updates()
+        if self._state == base.PENDING:
+            self._state = base.RUNNING
+            return self._send_updates()
+        elif self._state == base.CANCELLED:
+            return False
+        else:
+            raise RuntimeError(f"Can't set_running_or_notify_cancel to a future that is in {self._state}")

     def cancel(self):
-        raise NotImplementedError()
+        self._sync_updates()
+        if self._state in self.TERMINAL_STATES:
+            return False
+        self._state, self._exception = base.CANCELLED, base.CancelledError()
+        return self._send_updates()

     def result(self, timeout=None):
-        self._recv(timeout)
-        if self.state == self.STATE_FINISHED:
-            return self._result
-        elif self.state == self.STATE_EXCEPTION:
+        self._await_terminal_state(timeout)
+        if self._exception is not None:
             raise self._exception
-        else:
-            assert self.state == self.STATE_CANCELLED
-            raise CancelledError()
+        return self._result

     def exception(self, timeout=None):
-        self._recv(timeout)
+        self._await_terminal_state(timeout)
+        if self._state == base.CANCELLED:
+            raise base.CancelledError()
         return self._exception

     def done(self):
-        return self.state in (self.STATE_FINISHED, self.STATE_EXCEPTION, self.STATE_CANCELLED)
+        self._sync_updates()
+        return self._state in self.TERMINAL_STATES

     def running(self):
-        return self.state == self.STATE_RUNNING
+        self._sync_updates()
+        return self._state == base.RUNNING

     def cancelled(self):
-        warn("cancelled not implemented")
-        return False
+        self._sync_updates()
+        return self._state == base.CANCELLED

     def add_done_callback(self, callback):
-        raise NotImplementedError()
-
-    def __repr__(self):
-        try:
-            self._recv(timeout=0)
-        except TimeoutError:
-            pass
-        if self.state == self.STATE_FINISHED:
-            return "<MPFuture at 0x{:x} state=finished returned {}>".format(id(self), type(self._result))
-        elif self.state == self.STATE_EXCEPTION:
-            return "<MPFuture at 0x{:x} state=finished raised {}>".format(id(self), type(self._exception))
-        else:
-            return "<MPFuture at 0x{:x} state={}>".format(id(self), self.state)
+        raise NotImplementedError(f"MPFuture doesn't support callbacks.")

-    async def _async_recv(self, timeout):
-        loop = asyncio.get_running_loop()
+    def remove_done_callback(self, callback):
+        raise NotImplementedError(f"MPFuture doesn't support callbacks.")

-        if self.state in (self.STATE_PENDING, self.STATE_RUNNING):
-            status, payload = await loop.run_in_executor(None, self.poll_and_recv, timeout)
+    def get_loop(self):
+        raise NotImplementedError(f"MPFuture doesn't support get_loop")

-            assert status in self.STATES
-            self.state = status
+    @property
+    @lru_cache()
+    def _shutdown_trigger(self):
+        return base.Future()

-            if status == self.STATE_FINISHED:
-                self._result = payload
-            elif status == self.STATE_EXCEPTION:
-                self._exception = payload
-            elif status in (self.STATE_RUNNING, self.STATE_CANCELLED):
-                pass  # only update self.state
+    def __repr__(self):
+        self._sync_updates()
+        if self._state == base.FINISHED:
+            if self._exception:
+                return "<MPFuture at 0x{:x} state=finished raised {}>".format(id(self), type(self._exception))
             else:
-                raise ValueError("Result status should not be self.STATE_PENDING")
+                return "<MPFuture at 0x{:x} state=finished returned {}>".format(id(self), type(self._result))
+        else:
+            return "<MPFuture at 0x{:x} state={}>".format(id(self), self._state)

-    async def async_result(self, timeout=None):
-        await self._async_recv(timeout)
-        if self.state == self.STATE_FINISHED:
-            return self._result
-        elif self.state == self.STATE_EXCEPTION:
+    def __await__(self):
+        yield from asyncio.get_running_loop().run_in_executor(None, self._await_terminal_state, None).__await__()
+        if self._exception:
             raise self._exception
-        else:
-            assert self.state == self.STATE_CANCELLED
-            raise CancelledError()
+        return self._result

-    async def async_exception(self, timeout=None):
-        await self._async_recv(timeout)
-        return self._exception
+    def __del__(self):
+        self._shutdown_trigger.set_result(True)
+        self.connection.close()
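
Taken together with the connection_handler.py change above, the rewritten MPFuture mirrors results, exceptions and cancellation between the two ends of a pipe and can be awaited directly. A short usage sketch in the spirit of the new tests in tests/test_util_modules.py further down; in real use one end of the pair would be handed to another process:

import asyncio
import hivemind

def sync_example():
    local_end, remote_end = hivemind.MPFuture.make_pair()
    assert remote_end.set_running_or_notify_cancel() is True
    remote_end.set_result((123, 'ololo'))           # the state is mirrored through the pipe
    assert local_end.result() == (123, 'ololo')
    assert local_end.done() and not local_end.cancelled()

async def async_example():
    local_end, remote_end = hivemind.MPFuture.make_pair()

    async def assign_later():
        await asyncio.sleep(0.1)
        remote_end.set_result(42)

    asyncio.create_task(assign_later())
    assert await local_end == 42                    # MPFuture is awaitable like asyncio.Future

if __name__ == '__main__':
    sync_example()
    asyncio.run(async_example())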

+ 6 - 13
hivemind/utils/threading.py

@@ -1,27 +1,20 @@
 import os
-from concurrent.futures import Future, ThreadPoolExecutor, as_completed, TimeoutError
+from concurrent.futures import Future, as_completed, TimeoutError, ThreadPoolExecutor
 import time
 from typing import Optional, List

-GLOBAL_EXECUTOR = ThreadPoolExecutor(max_workers=os.environ.get("HIVEMIND_THREADS", float('inf')))
+EXECUTOR_PID, GLOBAL_EXECUTOR = None, None


 def run_in_background(func: callable, *args, **kwargs) -> Future:
     """ run func(*args, **kwargs) in background and return Future for its outputs """
-
+    global EXECUTOR_PID, GLOBAL_EXECUTOR
+    if os.getpid() != EXECUTOR_PID:
+        GLOBAL_EXECUTOR = ThreadPoolExecutor(max_workers=os.environ.get("HIVEMIND_THREADS", float('inf')))
+        EXECUTOR_PID = os.getpid()
     return GLOBAL_EXECUTOR.submit(func, *args, **kwargs)


-def run_forever(func: callable, *args, **kwargs):
-    """ A function that runs a :func: in background forever. Returns a future that catches exceptions """
-
-    def repeat():
-        while True:
-            func(*args, **kwargs)
-
-    return run_in_background(repeat)
-
-
 def run_and_await_k(jobs: List[callable], k: int,
                     timeout_after_k: Optional[float] = 0, timeout_total: Optional[float] = None):
     """

+ 6 - 7
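The executor is now created lazily and rebuilt whenever run_in_background is first called in a new process, so a forked worker never reuses the parent's ThreadPoolExecutor and its internal locks. A quick sanity-check sketch (hypothetical snippet, assuming the default multiprocessing start method):

import multiprocessing as mp
from hivemind.utils.threading import run_in_background

def child():
    # the first call inside the child builds a fresh executor instead of inheriting the parent's
    assert run_in_background(lambda: 6 * 7).result() == 42

if __name__ == '__main__':
    run_in_background(lambda: 'warm up the parent executor').result()
    process = mp.Process(target=child)
    process.start()
    process.join()
    assert process.exitcode == 0
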
tests/benchmark_dht.py

@@ -15,15 +15,15 @@ def random_endpoint() -> hivemind.Endpoint:
 
 
 
 
 def benchmark_dht(num_peers: int, initial_peers: int, num_experts: int, expert_batch_size: int, random_seed: int,
 def benchmark_dht(num_peers: int, initial_peers: int, num_experts: int, expert_batch_size: int, random_seed: int,
-                  wait_after_request: float, wait_before_read: float, wait_timeout: float, expiration_time: float):
-    old_expiration_time, hivemind.DHT.EXPIRATION = hivemind.DHT.EXPIRATION, expiration_time
+                  wait_after_request: float, wait_before_read: float, wait_timeout: float, expiration: float):
     random.seed(random_seed)
     random.seed(random_seed)

     print("Creating peers...")
     peers = []
     for _ in trange(num_peers):
         neighbors = [f'0.0.0.0:{node.port}' for node in random.sample(peers, min(initial_peers, len(peers)))]
+        peer = hivemind.DHT(initial_peers=neighbors, start=True, wait_timeout=wait_timeout,
+                            expiration=expiration, listen_on=f'0.0.0.0:*')
         peers.append(peer)
         peers.append(peer)

     store_peer, get_peer = peers[-2:]
     print(f"Mean store time: {total_store_time / total_stores:.5}, Total: {total_store_time:.5}")
     print(f"Mean store time: {total_store_time / total_stores:.5}, Total: {total_store_time:.5}")
     time.sleep(wait_before_read)

+    if time.perf_counter() - benchmark_started > expiration:
         warn("Warning: all keys expired before benchmark started getting them. Consider increasing expiration_time")
         warn("Warning: all keys expired before benchmark started getting them. Consider increasing expiration_time")

     successful_gets = total_get_time = 0
                     and expert.endpoint == endpoints[start // expert_batch_size]:
                     and expert.endpoint == endpoints[start // expert_batch_size]:
                 successful_gets += 1

+    if time.perf_counter() - benchmark_started > expiration:
         warn("Warning: keys expired midway during get requests. If that is not desired, increase expiration_time param")
         warn("Warning: keys expired midway during get requests. If that is not desired, increase expiration_time param")

     print(f"Get success rate: {successful_gets / len(expert_uids) * 100:.1f} ({successful_gets} / {len(expert_uids)})")
@@ -75,7 +75,6 @@ def benchmark_dht(num_peers: int, initial_peers: int, num_experts: int, expert_b

     alive_peers = [peer.is_alive() for peer in peers]
     print(f"Node survival rate: {len(alive_peers) / len(peers) * 100:.3f}%")
-    hivemind.DHT.EXPIRATION = old_expiration_time


 if __name__ == "__main__":
     parser.add_argument('--initial_peers', type=int, default=1, required=False)
     parser.add_argument('--initial_peers', type=int, default=1, required=False)
     parser.add_argument('--num_experts', type=int, default=256, required=False)
     parser.add_argument('--expert_batch_size', type=int, default=32, required=False)
+    parser.add_argument('--expiration', type=float, default=300, required=False)
     parser.add_argument('--wait_after_request', type=float, default=0, required=False)
     parser.add_argument('--wait_after_request', type=float, default=0, required=False)
     parser.add_argument('--wait_before_read', type=float, default=0, required=False)
     parser.add_argument('--wait_timeout', type=float, default=5, required=False)
+ 2 - 2
tests/test_dht.py

@@ -5,7 +5,7 @@ import random
 import heapq
 import heapq
 import uuid
 from itertools import chain
+from typing import Optional
 import numpy as np
 import numpy as np

 import hivemind
         # test 7: bulk store and bulk get
         # test 7: bulk store and bulk get
         keys = 'foo', 'bar', 'baz', 'zzz'
         values = 3, 2, 'batman', [1, 2, 3]
+        store_ok = loop.run_until_complete(me.store_many(keys, values, expiration_time=get_dht_time() + 999))
         assert all(store_ok.values()), "failed to store one or more keys"
         assert all(store_ok.values()), "failed to store one or more keys"
         response = loop.run_until_complete(me.get_many(keys[::-1]))
         for key, value in zip(keys, values):
+ 116 - 0
tests/test_util_modules.py

@@ -0,0 +1,116 @@
+import asyncio
+
+import pytest
+import hivemind
+
+from concurrent.futures import CancelledError
+
+
+def test_mpfuture_result():
+    f1, f2 = hivemind.MPFuture.make_pair()
+    f1.set_result(321)
+    assert f2.result() == 321
+    assert f1.result() == 321
+
+    for future in [f1, f2]:
+        with pytest.raises(RuntimeError):
+            future.set_result(123)
+        with pytest.raises(RuntimeError):
+            future.set_exception(ValueError())
+        assert future.cancel() is False
+        assert future.done() and not future.running() and not future.cancelled()
+
+    f1, f2 = hivemind.MPFuture.make_pair()
+    with pytest.raises(TimeoutError):
+        f1.result(timeout=1e-3)
+
+    f2.set_result(['abacaba', 123])
+    assert f1.result() == ['abacaba', 123]
+
+
+def test_mpfuture_exception():
+    f1, f2 = hivemind.MPFuture.make_pair()
+    with pytest.raises(TimeoutError):
+        f1.exception(timeout=1e-3)
+
+    f2.set_exception(NotImplementedError())
+
+    for future in [f1, f2]:
+        assert isinstance(future.exception(), NotImplementedError)
+        with pytest.raises(NotImplementedError):
+            future.result()
+        assert future.cancel() is False
+        assert future.done() and not future.running() and not future.cancelled()
+
+
+def test_mpfuture_cancel():
+    f1, f2 = hivemind.MPFuture.make_pair()
+    assert not f2.cancelled()
+    f1.cancel()
+    for future in [f1, f2]:
+        with pytest.raises(CancelledError):
+            future.result()
+        with pytest.raises(CancelledError):
+            future.exception()
+        with pytest.raises(RuntimeError):
+            future.set_result(123)
+        with pytest.raises(RuntimeError):
+            future.set_exception(NotImplementedError)
+        assert future.cancelled() and future.done() and not future.running()
+
+
+def test_mpfuture_status():
+    f1, f2 = hivemind.MPFuture.make_pair()
+    assert f1.set_running_or_notify_cancel() is True
+    for future in [f1, f2]:
+        assert future.running() and not future.done() and not future.cancelled()
+        with pytest.raises(RuntimeError):
+            future.set_running_or_notify_cancel()
+    f2.cancel()
+    for future in [f1, f2]:
+        assert not future.running() and future.done() and future.cancelled()
+        assert future.set_running_or_notify_cancel() is False
+
+    f1, f2 = hivemind.MPFuture.make_pair()
+    f1.cancel()
+    for future in [f1, f2]:
+        assert future.set_running_or_notify_cancel() is False
+
+
+def test_await_mpfuture():
+    async def _run():
+        # await result
+        f1, f2 = hivemind.MPFuture.make_pair()
+        async def wait_and_assign():
+            assert f2.set_running_or_notify_cancel() is True
+            await asyncio.sleep(0.1)
+            f2.set_result((123, 'ololo'))
+
+        asyncio.create_task(wait_and_assign())
+        for future in [f1, f2]:
+            res = await future
+            assert res == (123, 'ololo')
+
+        # await cancel
+        f1, f2 = hivemind.MPFuture.make_pair()
+        async def wait_and_cancel():
+            await asyncio.sleep(0.1)
+            f1.cancel()
+
+        asyncio.create_task(wait_and_cancel())
+        for future in [f1, f2]:
+            with pytest.raises(CancelledError):
+                await future
+
+        # await exception
+        f1, f2 = hivemind.MPFuture.make_pair()
+        async def wait_and_raise():
+            await asyncio.sleep(0.1)
+            f1.set_exception(SystemError())
+
+        asyncio.create_task(wait_and_raise())
+        for future in [f1, f2]:
+            with pytest.raises(SystemError):
+                await future
+
+    asyncio.new_event_loop().run_until_complete(_run())

+ 9 - 11
tests/test_utils/run_server.py

@@ -14,7 +14,7 @@ from test_utils.layers import name_to_block, name_to_input
 def make_dummy_server(listen_on='0.0.0.0:*', num_experts=1, expert_cls='ffn', hidden_dim=1024,
                       num_handlers=None, expert_prefix='expert', expert_offset=0, max_batch_size=16384, device=None,
                       no_optimizer=False, no_dht=False, initial_peers=(), dht_port=None, root_port=None, verbose=True,
-                      UID_DELIMETER=hivemind.DHT.UID_DELIMETER, start=False, **kwargs) -> hivemind.Server:
+                      start=False, **kwargs) -> hivemind.Server:
     """
     Instantiate a server with several identical experts. See argparse comments below for details
     :param listen_on: network interface with address and (optional) port, e.g. "127.0.0.1:1337" or "[::]:80"
@@ -47,9 +47,8 @@ def make_dummy_server(listen_on='0.0.0.0:*', num_experts=1, expert_cls='ffn', hi
     if not no_dht:
         if not len(initial_peers):
             print("No initial peers provided. Starting additional dht as an initial peer.")
-            dht_root = hivemind.DHT(initial_peers=initial_peers,
-                                    listen_on=f"{hivemind.LOCALHOST}:{root_port or hivemind.find_open_port()}",
-                                    start=True)
+            dht_root = hivemind.DHT(initial_peers=initial_peers, start=True,
+                                    listen_on=f"{hivemind.LOCALHOST}:{root_port or hivemind.find_open_port()}")
             print(f"Initializing DHT with port {dht_root.port}")
             initial_peers = [f"{hivemind.LOCALHOST}:{dht_root.port}"]
         else:
@@ -57,9 +56,8 @@ def make_dummy_server(listen_on='0.0.0.0:*', num_experts=1, expert_cls='ffn', hi
             if root_port is not None:
                 print(f"Warning: root_port={root_port} will not be used since we already have peers.")

-        dht = hivemind.DHT(initial_peers=initial_peers,
-                           listen_on=f"{hivemind.LOCALHOST}:{dht_port or hivemind.find_open_port()}",
-                           start=True)
+        dht = hivemind.DHT(initial_peers=initial_peers, start=True,
+                           listen_on=f"{hivemind.LOCALHOST}:{dht_port or hivemind.find_open_port()}")
         if verbose:
             print(f"Running dht node on port {dht.port}")

@@ -74,7 +72,7 @@ def make_dummy_server(listen_on='0.0.0.0:*', num_experts=1, expert_cls='ffn', hi
     for i in range(num_experts):
         expert = name_to_block[expert_cls](hidden_dim)
         opt = torch.optim.SGD(expert.parameters(), 0.0 if no_optimizer else 0.05)
-        expert_uid = f'{expert_prefix}{UID_DELIMETER}{i + expert_offset}'
+        expert_uid = f'{expert_prefix}{hivemind.DHT.UID_DELIMITER}{i + expert_offset}'
         experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert, opt=opt,
                                                      args_schema=args_schema,
                                                      outputs_schema=hivemind.BatchTensorDescriptor(hidden_dim),
@@ -154,12 +152,12 @@ if __name__ == '__main__':
     parser.add_argument('--no_optimizer', action='store_true', help='if specified, all optimizers use learning rate=0')
     parser.add_argument('--no_dht', action='store_true', help='if specified, the server will not be attached to a dht')
     parser.add_argument('--initial_peers', type=str, default="[]", required=False, help='a list of peers that will'
-                                                                                        ' introduce this node to the dht, e.g. [("1.2.3.4", 1337), ("127.0.0.1", 4321)]')
+                        ' introduce this node to the dht, e.g. [("1.2.3.4", 1337), ("127.0.0.1", 4321)]')
     parser.add_argument('--dht_port', type=int, default=None, required=False, help='DHT node will listen on this port')
     parser.add_argument('--root_port', type=int, default=None, required=False, help='If this server does not have peers'
-                                                                                    ', it will create a virtual dht node on this port. You can then use this node as initial peer.')
+                        ', it will create a virtual dht node on this port. You can then use this node as initial peer.')
     parser.add_argument('--increase_file_limit', action='store_true', help='On *nix, this will increase the max number'
-                                                                           ' of processes a server can spawn before hitting "Too many open files"; Use at your own risk.')
+                        ' of processes a server can spawn before hitting "Too many open files"; Use at your own risk.')

     args = vars(parser.parse_args())