
Refactor MPFuture to use a single pipe/thread per process (#298)

- Removed hivemind.utils.threading.run_in_background and the HIVEMIND_THREADS environment variable
- Refactored MPFuture to be a single object instead of a linked pair of objects
- MPFuture now uses a single process-wide pipe and background thread instead of spawning a new pipe/thread for each future
- MPFuture.result/exception can now only be awaited from the process that created the future (see the usage sketch below)
- MPFuture now returns the same exception types as a regular concurrent.futures.Future (and as asyncio.Future in __await__)
- Added more thorough tests for MPFuture

Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com>
Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>
Co-authored-by: Michael Diskin <yhn1124@gmail.com>
justheuristic 4 years ago
parent
commit
200fbecdbf
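
For reference, here is a minimal usage sketch of the refactored API. It is not part of the diff; it mirrors the pattern exercised by the new tests in this commit (one MPFuture created in the origin process, fulfilled from a forked worker, awaited only by the origin) and assumes fork-based multiprocessing, as noted in the MPFuture docstring.

```python
import multiprocessing as mp

import hivemind


def worker(future: hivemind.MPFuture):
    # any process that inherited the future (e.g. via fork) may fulfill it...
    future.set_result(42)
    # ...but only the origin process is allowed to await or read the result


if __name__ == '__main__':
    future = hivemind.MPFuture()  # replaces the old MPFuture.make_pair()
    p = mp.Process(target=worker, args=(future,))
    p.start()
    print(future.result())  # blocks in the origin process until the worker sets a result
    p.join()
```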

+ 1 - 1
benchmarks/benchmark_averaging.py

@@ -6,7 +6,7 @@ import argparse
 import torch
 
 import hivemind
-from hivemind.utils import LOCALHOST, increase_file_limit, get_logger
+from hivemind.utils import LOCALHOST, get_logger, increase_file_limit
 from hivemind.proto import runtime_pb2
 
 

+ 1 - 1
benchmarks/benchmark_dht.py

@@ -6,7 +6,7 @@ from tqdm import trange
 
 import hivemind
 import hivemind.server.expert_uid
-from hivemind.utils.threading import increase_file_limit
+from hivemind.utils.limits import increase_file_limit
 
 logger = hivemind.get_logger(__name__)
 

+ 1 - 1
benchmarks/benchmark_throughput.py

@@ -9,7 +9,7 @@ import torch
 import hivemind
 from hivemind import find_open_port
 from hivemind.server import layers
-from hivemind.utils.threading import increase_file_limit
+from hivemind.utils.limits import increase_file_limit
 from hivemind.utils.logging import get_logger
 
 

+ 2 - 2
examples/albert/README.md

@@ -40,7 +40,7 @@ wandb: Run `wandb offline` to turn off syncing.
   - if necessary, specify paths: `--dataset_path ./path/to/unpacked/data --tokenizer ./path/to/tokenizer/config` (see [default paths](https://github.com/learning-at-home/hivemind/blob/collaborative_albert_example/examples/albert/run_trainer.py#L63-L69) for reference)
   - run:
 ```shell
-HIVEMIND_THREADS=64 python run_trainer.py \
+python run_trainer.py \
  --experiment_prefix SAME_AS_IN_RUN_FIRST_PEER --initial_peers ONE_OR_MORE_PEERS --seed 42 \
  --logging_first_step --logging_steps 100  --output_dir ./outputs --overwrite_output_dir --logging_dir ./logs
 ```
@@ -88,7 +88,7 @@ Here's an example of a full trainer script for Google Colab:
 !pip install transformers datasets sentencepiece torch_optimizer==0.1.0
 !git clone https://github.com/learning-at-home/hivemind && cd hivemind && pip install -e .
 !curl -L YOUR_HOSTED_DATA | tar xzf -     # example: https://hivemind-data.s3.us-east-2.amazonaws.com/wikitext103.tar.gz
-!ulimit -n 4096 && HIVEMIND_THREADS=256 python ./hivemind/examples/albert/run_trainer.py \
+!ulimit -n 4096 && python ./hivemind/examples/albert/run_trainer.py \
 --client_mode --initial_peers ONE_OR_MORE_PEERS  --averaging_expiration 10 \
 --batch_size_lead 300 --per_device_train_batch_size 4 --gradient_accumulation_steps 1 \
 --logging_first_step --logging_steps 100  --output_dir ./outputs --overwrite_output_dir --logging_dir ./logs \

+ 10 - 10
hivemind/client/averaging/__init__.py

@@ -290,9 +290,9 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
             weight = float(self.mode != AveragingMode.AUX)
         assert isinstance(weight, (int, float)) and weight >= 0, f"Expected a positive int/float, got {type(weight)}"
 
-        future, _future = MPFuture.make_pair()
+        future = MPFuture()
         gather_binary = self.serializer.dumps(gather)  # serialize here to avoid loading modules in the averager process
-        self._outer_pipe.send(('_step', [], dict(future=_future, gather_binary=gather_binary, weight=weight,
+        self._outer_pipe.send(('_step', [], dict(future=future, gather_binary=gather_binary, weight=weight,
                                                  allow_retries=allow_retries, timeout=timeout)))
         return future.result() if wait else future
 
@@ -463,8 +463,8 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
 
     async def _get_current_state_from_host_process(self):
         """ Executed in the averager process inside rpc_download_state """
-        future, _future = MPFuture.make_pair()
-        self._inner_pipe.send(('_TRIGGER_GET_CURRENT_STATE', _future))
+        future = MPFuture()
+        self._inner_pipe.send(('_TRIGGER_GET_CURRENT_STATE', future))
         return await future
 
     def load_state_from_peers(self, wait=True) -> Optional[Tuple[Any, Sequence[torch.Tensor]]]:
@@ -477,8 +477,8 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
 
         The exact contents of both metadata and tensors are determined by get_current_state method
         """
-        future, _future = MPFuture.make_pair()
-        self._outer_pipe.send(('_load_state_from_peers', [], dict(future=_future)))
+        future = MPFuture()
+        self._outer_pipe.send(('_load_state_from_peers', [], dict(future=future)))
         return future.result() if wait else future
 
     async def _load_state_from_peers(self, future: MPFuture):
@@ -537,8 +537,8 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         :param wait: if True, return bits immediately. Otherwise return awaitable MPFuture
         :returns: averager's current group key bits (without prefix)
         """
-        future, _future = MPFuture.make_pair()
-        self._outer_pipe.send(('_get_group_bits', [], dict(future=_future)))
+        future = MPFuture()
+        self._outer_pipe.send(('_get_group_bits', [], dict(future=future)))
         return future.result() if wait else future
 
     async def _get_group_bits(self, future: MPFuture):
@@ -549,9 +549,9 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         :param group_bits: group bits (string of '0' or '1') to be used in averager's group key
         :param wait: if True, wait until the update is confirmed by the averager. Otherwise return immediately
         """
-        future, _future = MPFuture.make_pair()
+        future = MPFuture()
         assert all(bit in '01' for bit in group_bits)
-        self._outer_pipe.send(('_set_group_bits', [], dict(group_bits=group_bits, future=_future)))
+        self._outer_pipe.send(('_set_group_bits', [], dict(group_bits=group_bits, future=future)))
         return future.result() if wait else future
 
     async def _set_group_bits(self, group_bits: str, future: MPFuture):
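
The call sites above all follow the same request pattern: the caller creates a single MPFuture, sends it through the command pipe to the background averager process, and that process fulfills it. A simplified, hypothetical sketch of this pattern (names such as `_worker` and `'_do_something'` are illustrative, not actual hivemind internals; assumes fork-based multiprocessing):

```python
import multiprocessing as mp

from hivemind.utils.mpfuture import MPFuture


def _worker(inner_pipe):
    # hypothetical background process: receives (method, kwargs) commands and fulfills the attached future
    method, kwargs = inner_pipe.recv()
    future = kwargs.pop('future')
    future.set_result(f"handled {method}")


if __name__ == '__main__':
    outer_pipe, inner_pipe = mp.Pipe()
    proc = mp.Process(target=_worker, args=(inner_pipe,), daemon=True)
    proc.start()

    future = MPFuture()  # a single future object; no more linked pair
    outer_pipe.send(('_do_something', dict(future=future)))
    print(future.result())  # awaited only in the process that created the future
    proc.join()
```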

+ 9 - 7
hivemind/client/averaging/training.py

@@ -1,4 +1,5 @@
 """ An extension of averager that supports common optimization use cases. """
+from concurrent.futures import ThreadPoolExecutor
 from itertools import chain
 from threading import Lock
 from typing import Sequence, Dict, Iterator, Optional
@@ -7,7 +8,7 @@ from contextlib import nullcontext
 import torch
 
 from hivemind.client.averaging import DecentralizedAverager
-from hivemind.utils import nested_flatten, nested_pack, get_logger, run_in_background
+from hivemind.utils import nested_flatten, nested_pack, get_logger
 
 logger = get_logger(__name__)
 
@@ -39,6 +40,7 @@ class TrainingAverager(DecentralizedAverager):
         self.opt, self.extra_tensors, self.local_step = opt, tuple(extra_tensors), 0
         self.opt_statistics = tuple(average_opt_statistics)
         self.average_parameters, self.average_gradients = average_parameters, average_gradients
+        self.step_executor = ThreadPoolExecutor(max_workers=1)
         self.lock_averager_step = Lock()
         if initialize_optimizer:
             initialize_optimizer_state(opt)  # note: this will run one optimizer step!
@@ -47,15 +49,15 @@ class TrainingAverager(DecentralizedAverager):
             averaged_tensors = [tensor.detach().cpu().float().clone() for tensor in self.local_tensors()]
         super().__init__(averaged_tensors=averaged_tensors, **kwargs)
 
-    @torch.no_grad()
     def step(self, data_lock: Optional[Lock] = None, wait: bool = True, **kwargs):
-        """ Average optimizer weights and gradients with peers.
+        """
+        Average optimizer weights and gradients with peers.
+
         :param data_lock: averager locks it when model parameters are modified. Otherwise it's assumed that no model
         modifications occur during averaging step
-        :param wait: if True waits, otherwise returns Future
         """
         if not wait:
-            return run_in_background(self.step, data_lock, wait=True, **kwargs)
+            return self.step_executor.submit(self.step, data_lock, wait=True, **kwargs)
 
         # if data_lock is supplied, tensors might change during averaging, so we need to copy them
         use_old_local_tensors = data_lock is not None
@@ -63,7 +65,7 @@ class TrainingAverager(DecentralizedAverager):
             data_lock = nullcontext()
 
         local_tensors = list(self.local_tensors())
-        with self.lock_averager_step:
+        with self.lock_averager_step, torch.no_grad():
             # fill averager's tensors with current local tensors
             with data_lock, self.get_tensors() as averaged_tensors:
                 if use_old_local_tensors:
@@ -73,7 +75,7 @@ class TrainingAverager(DecentralizedAverager):
                 for averaged_tensor, local_tensor in zip(averaged_tensors, local_tensors):
                     averaged_tensor[...] = local_tensor.cpu().float()
 
-            # find a group and hopefully average tensors with peers, scaled by peer's weight
+            # find a group and hopefully average tensors with peers, use batch sizes as weights
             gathered = super().step(**kwargs)
             if gathered is not None:
                 # load averaged tensors back into model

+ 8 - 8
hivemind/dht/__init__.py

@@ -127,8 +127,8 @@ class DHT(mp.Process):
         :param kwargs: parameters forwarded to DHTNode.get_many_by_id
         :returns: (value, expiration time); if value was not found, returns None
         """
-        future, _future = MPFuture.make_pair()
-        self._outer_pipe.send(('_get', [], dict(key=key, latest=latest, future=_future, **kwargs)))
+        future = MPFuture()
+        self._outer_pipe.send(('_get', [], dict(key=key, latest=latest, future=future, **kwargs)))
         return future if return_future else future.result()
 
     async def _get(self, key: DHTKey, latest: bool, future: MPFuture, **kwargs):
@@ -153,9 +153,9 @@ class DHT(mp.Process):
         :param return_future: if False (default), return when finished. Otherwise return MPFuture and run in background.
         :returns: True if store succeeds, False if it fails (due to no response or newer value)
         """
-        future, _future = MPFuture.make_pair()
+        future = MPFuture()
         self._outer_pipe.send(('_store', [], dict(key=key, value=value, expiration_time=expiration_time, subkey=subkey,
-                                                  future=_future, **kwargs)))
+                                                  future=future, **kwargs)))
         return future if return_future else future.result()
 
     async def _store(self, key: DHTKey, value: DHTValue, expiration_time: DHTExpiration,
@@ -184,8 +184,8 @@ class DHT(mp.Process):
           or use asyncio.get_event_loop().run_in_executor(...) to prevent coroutine from blocking background DHT tasks
         :note: when run_coroutine is called with wait=False, MPFuture can be cancelled to interrupt the task.
         """
-        future, _future = MPFuture.make_pair()
-        self._outer_pipe.send(('_run_coroutine', [], dict(coro=coro, future=_future)))
+        future = MPFuture()
+        self._outer_pipe.send(('_run_coroutine', [], dict(coro=coro, future=future)))
         return future if return_future else future.result()
 
     async def _run_coroutine(self, coro: Callable[[DHT, DHTNode], Awaitable[ReturnType]],
@@ -226,8 +226,8 @@ class DHT(mp.Process):
         """
         assert num_peers is None or peers == (), "please specify either a num_peers or the list of peers, not both"
         assert not isinstance(peers, str) and isinstance(peers, Sequence), "Please send a list / tuple of endpoints"
-        future, _future = MPFuture.make_pair()
-        self._outer_pipe.send(('_get_visible_address', [], dict(num_peers=num_peers, peers=peers, future=_future)))
+        future = MPFuture()
+        self._outer_pipe.send(('_get_visible_address', [], dict(num_peers=num_peers, peers=peers, future=future)))
         return future.result()
 
     async def _get_visible_address(self, num_peers: Optional[int], peers: Sequence[Endpoint],

+ 1 - 1
hivemind/hivemind_cli/run_server.py

@@ -6,7 +6,7 @@ import torch
 
 from hivemind.proto.runtime_pb2 import CompressionType
 from hivemind.server import Server
-from hivemind.utils.threading import increase_file_limit
+from hivemind.utils.limits import increase_file_limit
 from hivemind.utils.logging import get_logger
 from hivemind.server.layers import schedule_name_to_scheduler
 

+ 7 - 7
hivemind/server/task_pool.py

@@ -14,7 +14,8 @@ from typing import List, Tuple, Dict, Any, Generator
 
 import torch
 
-from hivemind.utils import MPFuture, get_logger, FutureStateError
+from hivemind.utils import get_logger
+from hivemind.utils.mpfuture import MPFuture, InvalidStateError
 
 logger = get_logger(__name__)
 Task = namedtuple("Task", ("future", "args"))
@@ -89,15 +90,14 @@ class TaskPool(TaskPoolBase):
 
     def submit_task(self, *args: torch.Tensor) -> Future:
         """ Add task to this pool's queue, return Future for its output """
-        future1, future2 = MPFuture.make_pair()
-        task = Task(future1, args)
+        task = Task(MPFuture(), args)
         if self.get_task_size(task) > self.max_batch_size:
             exc = ValueError(f"Task size greater than max_batch_size ({self.max_batch_size}), it can't be processed")
-            future2.set_exception(exc)
+            task.future.set_exception(exc)
         else:
             self.tasks.put(task)
             self.undispatched_task_timestamps.put(time.time())
-        return future2
+        return task.future
 
     def iterate_minibatches(self, *args, **kwargs):
         """ Form minibatches by grouping one or more tasks together up to self.max_batch_size """
@@ -127,7 +127,7 @@ class TaskPool(TaskPoolBase):
                 if task.future.set_running_or_notify_cancel():
                     batch.append(task)
                     total_size += task_size
-            except FutureStateError as e:
+            except InvalidStateError as e:
                 logger.debug(f"Failed to add task to batch: {task.future} raised {e}")
 
     def run(self, *args, **kwargs):
@@ -196,7 +196,7 @@ class TaskPool(TaskPoolBase):
             for task, task_outputs in zip(batch_tasks, outputs_per_task):
                 try:
                     task.future.set_result(tuple(task_outputs))
-                except FutureStateError as e:
+                except InvalidStateError as e:
                     logger.debug(f"Failed to send task result due to an exception: {e}")
 
     @property

+ 1 - 1
hivemind/utils/__init__.py

@@ -1,11 +1,11 @@
 from hivemind.utils.asyncio import *
 from hivemind.utils.compression import serialize_torch_tensor, deserialize_torch_tensor
 from hivemind.utils.grpc import *
+from hivemind.utils.limits import increase_file_limit
 from hivemind.utils.logging import get_logger
 from hivemind.utils.mpfuture import *
 from hivemind.utils.nested import *
 from hivemind.utils.networking import *
 from hivemind.utils.serializer import *
 from hivemind.utils.tensor_descr import *
-from hivemind.utils.threading import *
 from hivemind.utils.timed_storage import *

+ 6 - 3
hivemind/utils/compression.py

@@ -1,3 +1,5 @@
+import os
+from concurrent.futures import ThreadPoolExecutor
 from typing import Tuple, Sequence, Optional
 
 import numpy as np
@@ -6,7 +8,7 @@ import warnings
 
 from hivemind.proto import runtime_pb2
 from hivemind.proto.runtime_pb2 import CompressionType
-from hivemind.utils.threading import run_in_background
+
 
 FP32_EPS = 1e-06
 NUM_BYTES_FLOAT32 = 4
@@ -17,6 +19,8 @@ UNIFORM_BUCKETS_STD_RANGE = 6
 FP16_MAX = 65_504
 UINT8_RANGE = 256
 
+COMPRESSION_EXECUTOR = ThreadPoolExecutor(max_workers=int(os.environ.get("QUANTILE_COMPRESSION_THREADS", 128)))
+
 warnings.filterwarnings("ignore", message="The given NumPy array is not writeable", category=UserWarning)
 
 
@@ -48,8 +52,7 @@ def _quantile_qq_approximation(array: np.array, n_quantiles: int, min_chunk_size
     jobs = []
     for i in range(num_chunks):
         chunk = slice(chunk_size * i, chunk_size * (i + 1))
-        jobs.append(run_in_background(
-            np.quantile, array[chunk], quantiles, out=partition_quantiles[i]))
+        jobs.append(COMPRESSION_EXECUTOR.submit(np.quantile, array[chunk], quantiles, out=partition_quantiles[i]))
 
     for job in jobs:
         job.result()

+ 0 - 14
hivemind/utils/threading.py → hivemind/utils/limits.py

@@ -1,21 +1,7 @@
-import os
-from concurrent.futures import Future, ThreadPoolExecutor
-
 from hivemind.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
-EXECUTOR_PID, GLOBAL_EXECUTOR = None, None
-
-
-def run_in_background(func: callable, *args, **kwargs) -> Future:
-    """ run func(*args, **kwargs) in background and return Future for its outputs """
-    global EXECUTOR_PID, GLOBAL_EXECUTOR
-    if os.getpid() != EXECUTOR_PID:
-        GLOBAL_EXECUTOR = ThreadPoolExecutor(max_workers=int(os.environ.get("HIVEMIND_THREADS", 128)))
-        EXECUTOR_PID = os.getpid()
-    return GLOBAL_EXECUTOR.submit(func, *args, **kwargs)
-
 
 def increase_file_limit(new_soft=2 ** 15, new_hard=2 ** 15):
     """ Increase the maximum number of open files. On Linux, this allows spawning more processes/threads. """

+ 212 - 121
hivemind/utils/mpfuture.py

@@ -2,171 +2,262 @@ from __future__ import annotations
 
 import asyncio
 import concurrent.futures._base as base
+from contextlib import nullcontext
 import multiprocessing as mp
 import multiprocessing.connection
-import time
-from functools import lru_cache
-from typing import Optional, Tuple, Generic, TypeVar
+import os
+import threading
+import uuid
+from enum import Enum, auto
+from typing import Generic, TypeVar, Dict, Optional, Any, Callable
 
-from hivemind.utils.threading import run_in_background
+import torch    # used for py3.7-compatible shared memory
 
+from hivemind.utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+# flavour types
 ResultType = TypeVar('ResultType')
+PID, UID, State, PipeEnd = int, int, str, mp.connection.Connection
+ALL_STATES = base.PENDING, base.RUNNING, base.FINISHED, base.CANCELLED, base.CANCELLED_AND_NOTIFIED
+TERMINAL_STATES = {base.FINISHED, base.CANCELLED, base.CANCELLED_AND_NOTIFIED}
 
+try:
+    from concurrent.futures import InvalidStateError
+except ImportError:
+    # Python 3.7 doesn't raise concurrent.futures.InvalidStateError for repeating set_result/set_exception calls and
+    # doesn't even define this error. In this module, we simulate the Python 3.8+ behavior,
+    # defining and raising this error if necessary.
+    class InvalidStateError(Exception):
+        """Raised when attempting to change state of a future in a terminal state (e.g. finished)"""
 
-class FutureStateError(RuntimeError):
-    """Raised when attempting to change state of a future in a terminal state (e.g. finished)"""
-    pass
+
+class UpdateType(Enum):
+    RESULT = auto()
+    EXCEPTION = auto()
+    CANCEL = auto()
 
 
 class MPFuture(base.Future, Generic[ResultType]):
-    """ Multiprocessing version of concurrent.futures.Future. Can also be awaited like asyncio.Future """
+    """
+    A version of concurrent.futures.Future / asyncio.Future that can be fulfilled from a separate process.
+    Any process can access future status and set the result / exception and check for state.
+    However, only the original process (i.e. the process that created the future) can await the result or exception.
+
+    :param use_lock: if True, operations with MPFuture use a global lock to prevent concurrent writes to the same pipe;
+      If set to False, writing to this future ignores global lock, slightly improving performance, but making user
+      responsible for avoiding concurrent set_result / set_exception calls to futures with the same process of origin.
+    :param loop: if specified, overrides default asyncio event loop for the purpose of awaiting MPFuture
+
+    :note: This is an internal primitive that is not guaranteed to work outside of hivemind applications.
+     More specifically, there are two known limitations:
+       - MPFuture works between processes created through inheritance (e.g. fork), *not* for independent processes
+       - MPFuture is deterministic if only one process can call set_result/set_exception/set_running_or_notify_cancel
+         and only the origin process can call result/exception/cancel.
+    """
+    _initialization_lock = mp.Lock()  # global lock that prevents simultaneous initialization of two processes
+    _update_lock = mp.Lock()  # global lock that prevents simultaneous writing to the same pipe
+    _global_sender_pipe: Optional[PipeEnd] = None  # a pipe that is used to send results/exceptions to this process
+    _pipe_waiter_thread: Optional[threading.Thread] = None  # process-specific thread that receives results/exceptions
+    _active_futures: Optional[Dict[UID, MPFuture]] = None  # pending or running futures originated from current process
+    _active_pid: Optional[PID] = None  # pid of currently active process; used to handle forks natively
 
-    TERMINAL_STATES = {base.FINISHED, base.CANCELLED, base.CANCELLED_AND_NOTIFIED}
+    def __init__(self, use_lock: bool = True, loop: Optional[asyncio.BaseEventLoop] = None):
+        self._origin_pid, self._uid = os.getpid(), uuid.uuid4().int
+        self._shared_state_code = torch.empty([], dtype=torch.uint8).share_memory_()
+        self._state_cache:  Dict[State, State] = {}  # mapping from global to cached local future used that makes updates immediately
+        # available on setter side; dictionary-based cache works because future can visit any state at most once
 
-    def __init__(self, connection: mp.connection.Connection):
-        """ manually create MPFuture. Please use MPFuture.make_pair instead """
+        base.Future.__init__(self)   # parent init is deferred because it uses self._shared_state_code
         self._state, self._result, self._exception = base.PENDING, None, None
-        self.connection = connection
+        self._use_lock = use_lock
 
-    @classmethod
-    def make_pair(cls) -> Tuple[MPFuture, MPFuture]:
-        """ Create a pair of linked futures to be used in two processes """
-        connection1, connection2 = mp.Pipe()
-        return cls(connection1), cls(connection2)
+        if self._origin_pid != MPFuture._active_pid:
+            with MPFuture._initialization_lock:
+                if self._origin_pid != MPFuture._active_pid:
+                    # note: the second if is intentional, see https://en.wikipedia.org/wiki/Double-checked_locking
+                    self._initialize_mpfuture_backend()
+        assert self._uid not in MPFuture._active_futures
+        MPFuture._active_futures[self._uid] = self
+        self._sender_pipe = MPFuture._global_sender_pipe
 
-    def _send_updates(self):
-        """ Send updates to a paired MPFuture """
         try:
-            self.connection.send((self._state, self._result, self._exception))
-            if self._state in self.TERMINAL_STATES:
-                self._shutdown_trigger.set_result(True)
-                self.connection.close()
-            return True
-        except BrokenPipeError:
-            return False
+            self._loop = loop or asyncio.get_event_loop()
+            self._aio_event = asyncio.Event()
+        except RuntimeError:
+            self._loop, self._aio_event = None, None
 
-    def _recv_updates(self, timeout: Optional[float]):
-        """ Await updates from a paired MPFuture """
-        try:
-            future = base.wait([run_in_background(self.connection.poll, timeout), self._shutdown_trigger],
-                               return_when=base.FIRST_COMPLETED)[0].pop()
-            if future is self._shutdown_trigger:
-                raise BrokenPipeError()
-            if not future.result():
-                raise TimeoutError()
-            self._state, result, exception = self.connection.recv()
-            self._result = result if result is not None else self._result
-            self._exception = exception if exception is not None else self._exception
-            if self._state in self.TERMINAL_STATES:
-                self.connection.close()
-        except TimeoutError as e:
-            raise e
-        except (BrokenPipeError, OSError, EOFError) as e:
-            if self._state in (base.PENDING, base.RUNNING):
-                self._state, self._exception = base.FINISHED, e
-
-    def _await_terminal_state(self, timeout: Optional[float]):
-        """ Await updates until future is either finished, cancelled or got an exception """
-        time_left = float('inf') if timeout is None else timeout
-        time_before = time.monotonic()
-        while self._state not in self.TERMINAL_STATES and time_left > 0:
-            self._recv_updates(time_left if timeout else None)
-            time_spent = time.monotonic() - time_before
-            time_left, time_before = time_left - time_spent, time_before + time_spent
-
-    def _sync_updates(self):
-        """ Apply queued updates from a paired MPFuture without waiting for new ones """
+    @property
+    def _state(self) -> State:
+        shared_state = ALL_STATES[self._shared_state_code.item()]
+        return self._state_cache.get(shared_state, shared_state)
+
+    @_state.setter
+    def _state(self, new_state: State):
+        self._shared_state_code[...] = ALL_STATES.index(new_state)
+        if self._state in TERMINAL_STATES and self._loop is not None and not self._aio_event.is_set():
+            self._set_event_threadsafe()
+
+    def _set_event_threadsafe(self):
         try:
-            self._recv_updates(timeout=0)
-        except TimeoutError:
-            pass
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+
+        async def _event_setter():
+            self._aio_event.set()
+
+        if loop == self.get_loop():
+            asyncio.create_task(_event_setter())
+        else:
+            asyncio.run_coroutine_threadsafe(_event_setter(), self._loop)
+
+    @classmethod
+    def _initialize_mpfuture_backend(cls):
+        pid = os.getpid()
+        logger.debug(f"Initializing MPFuture backend for pid {pid}")
+        assert pid != cls._active_pid, "already initialized"
+
+        receiver_pipe, cls._global_sender_pipe = mp.Pipe(duplex=False)
+        cls._active_pid, cls._active_futures = pid, {}
+        cls._pipe_waiter_thread = threading.Thread(target=cls._process_updates_in_background, args=[receiver_pipe],
+                                                   name=f'{__name__}.BACKEND', daemon=True)
+        cls._pipe_waiter_thread.start()
+
+    @classmethod
+    def _process_updates_in_background(cls, receiver_pipe: mp.connection.Connection):
+        pid = os.getpid()
+        while True:
+            try:
+                uid, update_type, payload = receiver_pipe.recv()
+                if uid not in cls._active_futures:
+                    logger.debug(f"Ignoring update to future with uid={uid}: the future is already done or destroyed")
+                elif update_type == UpdateType.RESULT:
+                    cls._active_futures.pop(uid).set_result(payload)
+                elif update_type == UpdateType.EXCEPTION:
+                    cls._active_futures.pop(uid).set_exception(payload)
+                elif update_type == UpdateType.CANCEL:
+                    cls._active_futures.pop(uid).cancel()
+                else:
+                    raise RuntimeError(f"Received unexpected update type {update_type}")
+            except (BrokenPipeError, EOFError):
+                logger.debug(f"Update pipe was was shut down unexpectedly (pid={pid})")
+            except Exception as e:
+                logger.exception(f"Could not retrieve update: caught {repr(e)} (pid={pid})")
+
+    def _send_update(self, update_type: UpdateType, payload: Any = None):
+        """ This method sends result, exception or cancel to the MPFuture origin. """
+        with MPFuture._update_lock if self._use_lock else nullcontext():
+            self._sender_pipe.send((self._uid, update_type, payload))
 
     def set_result(self, result: ResultType):
-        self._sync_updates()
-        if self._state in self.TERMINAL_STATES:
-            raise FutureStateError(f"Can't set_result to a future that is {self._state} ({self})")
-        self._state, self._result = base.FINISHED, result
-        return self._send_updates()
-
-    def set_exception(self, exception: BaseException):
-        self._sync_updates()
-        if self._state in self.TERMINAL_STATES:
-            raise FutureStateError(f"Can't set_exception to a future that is {self._state} ({self})")
-        self._state, self._exception = base.FINISHED, exception
-        self._send_updates()
+        if os.getpid() == self._origin_pid:
+            super().set_result(result)
+            MPFuture._active_futures.pop(self._uid, None)
+        elif self._state in TERMINAL_STATES:
+            raise InvalidStateError(f"Can't set_result to a future that is {self._state} ({self._uid})")
+        else:
+            self._state_cache[self._state], self._result = base.FINISHED, result
+            self._send_update(UpdateType.RESULT, result)
+
+    def set_exception(self, exception: Optional[BaseException]):
+        if os.getpid() == self._origin_pid:
+            super().set_exception(exception)
+            MPFuture._active_futures.pop(self._uid, None)
+        elif self._state in TERMINAL_STATES:
+            raise InvalidStateError(f"Can't set_exception to a future that is {self._state} ({self._uid})")
+        else:
+            self._state_cache[self._state], self._exception = base.FINISHED, exception
+            self._send_update(UpdateType.EXCEPTION, exception)
+
+    def cancel(self) -> bool:
+        if os.getpid() == self._origin_pid:
+            MPFuture._active_futures.pop(self._uid, None)
+            return super().cancel()
+        elif self._state in [base.RUNNING, base.FINISHED]:
+            return False
+        else:
+            self._state_cache[self._state] = base.CANCELLED
+            self._send_update(UpdateType.CANCEL)
+            return True
 
     def set_running_or_notify_cancel(self):
-        self._sync_updates()
         if self._state == base.PENDING:
             self._state = base.RUNNING
-            return self._send_updates()
+            return True
         elif self._state == base.CANCELLED:
             return False
         else:
-            raise FutureStateError(f"Can't set_running_or_notify_cancel to a future that is in {self._state} ({self})")
-
-    def cancel(self):
-        self._sync_updates()
-        if self._state in self.TERMINAL_STATES:
-            return False
-        self._state, self._exception = base.CANCELLED, base.CancelledError()
-        return self._send_updates()
+            raise InvalidStateError(f"Can't set_running_or_notify_cancel when future is in {self._state} ({self._uid})")
 
     def result(self, timeout: Optional[float] = None) -> ResultType:
-        self._await_terminal_state(timeout)
-        if self._exception is not None:
+        if self._state not in TERMINAL_STATES:
+            if os.getpid() != self._origin_pid:
+                raise RuntimeError("Only the process that created MPFuture can await result")
+            return super().result(timeout)
+        elif self._state == base.CANCELLED:
+            raise base.CancelledError()
+        elif self._exception:
             raise self._exception
-        return self._result
+        else:
+            return self._result
 
-    def exception(self, timeout=None) -> BaseException:
-        self._await_terminal_state(timeout)
-        if self._state == base.CANCELLED:
+    def exception(self, timeout: Optional[float] = None) -> Optional[BaseException]:
+        if self._state not in TERMINAL_STATES:
+            if os.getpid() != self._origin_pid:
+                raise RuntimeError("Only the process that created MPFuture can await exception")
+            return super().exception(timeout)
+        elif self._state == base.CANCELLED:
            raise base.CancelledError()
         return self._exception
 
     def done(self) -> bool:
-        self._sync_updates()
-        return self._state in self.TERMINAL_STATES
+        return self._state in TERMINAL_STATES
 
     def running(self):
-        self._sync_updates()
         return self._state == base.RUNNING
 
     def cancelled(self):
-        self._sync_updates()
         return self._state == base.CANCELLED
 
-    def add_done_callback(self, callback):
-        raise NotImplementedError(f"MPFuture doesn't support callbacks.")
-
-    def remove_done_callback(self, callback):
-        raise NotImplementedError(f"MPFuture doesn't support callbacks.")
+    def add_done_callback(self, callback: Callable[[MPFuture], None]):
+        if os.getpid() != self._origin_pid:
+            raise RuntimeError("Only the process that created MPFuture can set callbacks")
+        return super().add_done_callback(callback)
 
-    def get_loop(self):
-        raise NotImplementedError(f"MPFuture doesn't support get_loop")
-
-    @property
-    @lru_cache()
-    def _shutdown_trigger(self):
-        return base.Future()
-
-    def __repr__(self):
-        self._sync_updates()
-        if self._state == base.FINISHED:
-            if self._exception:
-                return "<MPFuture at 0x{:x} state=finished raised {}>".format(id(self), type(self._exception))
-            else:
-                return "<MPFuture at 0x{:x} state=finished returned {}>".format(id(self), type(self._result))
-        else:
-            return "<MPFuture at 0x{:x} state={}>".format(id(self), self._state)
+    def get_loop(self) -> Optional[asyncio.BaseEventLoop]:
+        return self._loop
 
     def __await__(self):
-        yield from asyncio.get_running_loop().run_in_executor(None, self._await_terminal_state, None).__await__()
-        if self._exception:
-            raise self._exception
-        return self._result
+        if not self._aio_event:
+            raise RuntimeError("Can't await: MPFuture was created with no event loop")
+        yield from self._aio_event.wait().__await__()
+        try:
+            return super().result(timeout=0)
+        except base.CancelledError:
+            raise asyncio.CancelledError()
 
     def __del__(self):
-        self._shutdown_trigger.set_result(True)
-        if hasattr(self, 'connection'):
-            self.connection.close()
+        if getattr(self, '_origin_pid', None) == os.getpid():
+            MPFuture._active_futures.pop(self._uid, None)
+        if getattr(self, '_aio_event', None):
+            self._aio_event.set()
+
+    def __getstate__(self):
+        return dict(_sender_pipe=self._sender_pipe, _shared_state_code=self._shared_state_code,
+                    _origin_pid=self._origin_pid, _uid=self._uid, _use_lock=self._use_lock,
+                    _result=self._result, _exception=self._exception)
+
+    def __setstate__(self, state):
+        self._sender_pipe = state['_sender_pipe']
+        self._shared_state_code = state['_shared_state_code']
+        self._origin_pid, self._uid = state['_origin_pid'], state['_uid']
+        self._result, self._exception = state['_result'], state['_exception']
+        self._use_lock = state['_use_lock']
+
+        self._waiters, self._done_callbacks = [], []
+        self._condition = threading.Condition()
+        self._aio_event, self._loop = None, None
+        self._state_cache = {}
+ 4 - 0
tests/test_averaging.py

@@ -423,3 +423,7 @@ def test_training_averager(n_steps: int = 10, n_dims: int = 16):
         assert torch.allclose(x2.grad, grad_avg)
         assert torch.allclose(opt1.state[x1]["exp_avg_sq"], stats_avg)
         assert torch.allclose(opt2.state[x2]["exp_avg_sq"], stats_avg)
+
+    averager1.shutdown()
+    averager2.shutdown()
+    dht.shutdown()

+ 255 - 74
tests/test_util_modules.py

@@ -1,129 +1,310 @@
 import asyncio
-from concurrent.futures import CancelledError
+import concurrent.futures
+import multiprocessing as mp
+import random
+import time
 
-import numpy as np
 import pytest
 import torch
+import numpy as np
 
+import hivemind
 from hivemind.proto.dht_pb2_grpc import DHTStub
 from hivemind.proto.runtime_pb2 import CompressionType
 from hivemind.proto.runtime_pb2_grpc import ConnectionHandlerStub
-import hivemind
 from hivemind.utils import MSGPackSerializer
 from hivemind.utils.compression import serialize_torch_tensor, deserialize_torch_tensor
 from hivemind.utils.asyncio import amap_in_executor, aiter, aenumerate, achain, anext, azip
-from hivemind.utils.mpfuture import FutureStateError
+from hivemind.utils.mpfuture import InvalidStateError
 
 
+@pytest.mark.forked
 def test_mpfuture_result():
-    f1, f2 = hivemind.MPFuture.make_pair()
-    f1.set_result(321)
-    assert f2.result() == 321
-    assert f1.result() == 321
+    future = hivemind.MPFuture()
 
-    for future in [f1, f2]:
-        with pytest.raises(FutureStateError):
-            future.set_result(123)
-        with pytest.raises(FutureStateError):
-            future.set_exception(ValueError())
-        assert future.cancel() is False
-        assert future.done() and not future.running() and not future.cancelled()
+    def _proc(future):
+        with pytest.raises(RuntimeError):
+            future.result()  # only creator process can await result
+
+        future.set_result(321)
+
+    p = mp.Process(target=_proc, args=(future,))
+    p.start()
+    p.join()
 
-    f1, f2 = hivemind.MPFuture.make_pair()
-    with pytest.raises(TimeoutError):
-        f1.result(timeout=1e-3)
+    assert future.result() == 321
+    assert future.exception() is None
+    assert future.cancel() is False
+    assert future.done() and not future.running() and not future.cancelled()
 
-    f2.set_result(['abacaba', 123])
-    assert f1.result() == ['abacaba', 123]
+    future = hivemind.MPFuture()
+    with pytest.raises(concurrent.futures.TimeoutError):
+        future.result(timeout=1e-3)
 
+    future.set_result(['abacaba', 123])
+    assert future.result() == ['abacaba', 123]
 
+
+@pytest.mark.forked
 def test_mpfuture_exception():
-    f1, f2 = hivemind.MPFuture.make_pair()
-    with pytest.raises(TimeoutError):
-        f1.exception(timeout=1e-3)
+    future = hivemind.MPFuture()
+    with pytest.raises(concurrent.futures.TimeoutError):
+        future.exception(timeout=1e-3)
 
-    f2.set_exception(NotImplementedError())
+    def _proc(future):
+        future.set_exception(NotImplementedError())
 
-    for future in [f1, f2]:
-        assert isinstance(future.exception(), NotImplementedError)
-        with pytest.raises(NotImplementedError):
-            future.result()
-        assert future.cancel() is False
-        assert future.done() and not future.running() and not future.cancelled()
+    p = mp.Process(target=_proc, args=(future,))
+    p.start()
+    p.join()
+
+    assert isinstance(future.exception(), NotImplementedError)
+    with pytest.raises(NotImplementedError):
+        future.result()
+    assert future.cancel() is False
+    assert future.done() and not future.running() and not future.cancelled()
 
 
+@pytest.mark.forked
 def test_mpfuture_cancel():
-    f1, f2 = hivemind.MPFuture.make_pair()
-    assert not f2.cancelled()
-    f1.cancel()
-    for future in [f1, f2]:
-        with pytest.raises(CancelledError):
+    future = hivemind.MPFuture()
+    assert not future.cancelled()
+    future.cancel()
+    evt = mp.Event()
+
+    def _proc():
+        with pytest.raises(concurrent.futures.CancelledError):
             future.result()
-        with pytest.raises(CancelledError):
+        with pytest.raises(concurrent.futures.CancelledError):
             future.exception()
-        with pytest.raises(FutureStateError):
+        with pytest.raises(InvalidStateError):
             future.set_result(123)
-        with pytest.raises(FutureStateError):
+        with pytest.raises(InvalidStateError):
             future.set_exception(NotImplementedError())
         assert future.cancelled() and future.done() and not future.running()
+        evt.set()
 
+    p = mp.Process(target=_proc)
+    p.start()
+    p.join()
+    assert evt.is_set()
 
+
+@pytest.mark.forked
 def test_mpfuture_status():
-    f1, f2 = hivemind.MPFuture.make_pair()
-    assert f1.set_running_or_notify_cancel() is True
-    for future in [f1, f2]:
-        assert future.running() and not future.done() and not future.cancelled()
-        with pytest.raises(RuntimeError):
-            future.set_running_or_notify_cancel()
-    f2.cancel()
-    for future in [f1, f2]:
+    evt = mp.Event()
+    future = hivemind.MPFuture()
+
+    def _proc1(future):
+        assert future.set_running_or_notify_cancel() is True
+        evt.set()
+
+    p = mp.Process(target=_proc1, args=(future,))
+    p.start()
+    p.join()
+    assert evt.is_set()
+    evt.clear()
+
+    assert future.running() and not future.done() and not future.cancelled()
+    with pytest.raises(InvalidStateError):
+        future.set_running_or_notify_cancel()
+
+    future = hivemind.MPFuture()
+    assert future.cancel()
+
+    def _proc2(future):
         assert not future.running() and future.done() and future.cancelled()
         assert future.set_running_or_notify_cancel() is False
+        evt.set()
 
-    f1, f2 = hivemind.MPFuture.make_pair()
-    f1.cancel()
-    for future in [f1, f2]:
-        assert future.set_running_or_notify_cancel() is False
+    p = mp.Process(target=_proc2, args=(future,))
+    p.start()
+    p.join()
+    evt.set()
+
+    future2 = hivemind.MPFuture()
+    future2.cancel()
+    assert future2.set_running_or_notify_cancel() is False
 
 
 @pytest.mark.asyncio
 async def test_await_mpfuture():
-    # await result
-    f1, f2 = hivemind.MPFuture.make_pair()
+    # await result from the same process, but a different coroutine
+    f1, f2 = hivemind.MPFuture(), hivemind.MPFuture()
 
-    async def wait_and_assign():
+    async def wait_and_assign_async():
         assert f2.set_running_or_notify_cancel() is True
         await asyncio.sleep(0.1)
-        f2.set_result((123, 'ololo'))
+        f1.set_result((123, 'ololo'))
+        f2.set_result((456, 'pyshpysh'))
+
+    asyncio.create_task(wait_and_assign_async())
 
-    asyncio.create_task(wait_and_assign())
-    for future in [f1, f2]:
-        res = await future
-        assert res == (123, 'ololo')
+    assert (await asyncio.gather(f1, f2)) == [(123, 'ololo'), (456, 'pyshpysh')]
+
+    # await result from separate processes
+    f1, f2 = hivemind.MPFuture(), hivemind.MPFuture()
+
+    def wait_and_assign(future, value):
+        time.sleep(0.1 * random.random())
+        future.set_result(value)
+
+    p1 = mp.Process(target=wait_and_assign, args=(f1, 'abc'))
+    p2 = mp.Process(target=wait_and_assign, args=(f2, 'def'))
+    for p in p1, p2:
+        p.start()
+
+    assert (await asyncio.gather(f1, f2)) == ['abc', 'def']
+    for p in p1, p2:
+        p.join()
 
     # await cancel
-    f1, f2 = hivemind.MPFuture.make_pair()
+    f1, f2 = hivemind.MPFuture(), hivemind.MPFuture()
 
-    async def wait_and_cancel():
-        await asyncio.sleep(0.1)
+    def wait_and_cancel():
+        time.sleep(0.01)
+        f2.set_result(123456)
+        time.sleep(0.1)
         f1.cancel()
 
-    asyncio.create_task(wait_and_cancel())
-    for future in [f1, f2]:
-        with pytest.raises(CancelledError):
-            await future
+    p = mp.Process(target=wait_and_cancel)
+    p.start()
+
+    with pytest.raises(asyncio.CancelledError):
+        # note: it is intended that MPFuture raises Cancel
+        await asyncio.gather(f1, f2)
+
+    p.join()
 
     # await exception
-    f1, f2 = hivemind.MPFuture.make_pair()
+    f1, f2 = hivemind.MPFuture(), hivemind.MPFuture()
 
-    async def wait_and_raise():
-        await asyncio.sleep(0.1)
-        f1.set_exception(SystemError())
+    def wait_and_raise():
+        time.sleep(0.01)
+        f2.set_result(123456)
+        time.sleep(0.1)
+        f1.set_exception(ValueError('we messed up'))
+
+    p = mp.Process(target=wait_and_raise)
+    p.start()
+
+    with pytest.raises(ValueError):
+        # note: it is intended that MPFuture raises Cancel
+        await asyncio.gather(f1, f2)
+
+    p.join()
+
+
+@pytest.mark.forked
+def test_mpfuture_bidirectional():
+    evt = mp.Event()
+    future_from_main = hivemind.MPFuture()
+
+    def _future_creator():
+        future_from_fork = hivemind.MPFuture()
+        future_from_main.set_result(('abc', future_from_fork))
+
+        if future_from_fork.result() == ['we', 'need', 'to', 'go', 'deeper']:
+            evt.set()
+
+    p = mp.Process(target=_future_creator)
+    p.start()
+
+    out = future_from_main.result()
+    assert isinstance(out[1], hivemind.MPFuture)
+    out[1].set_result(['we', 'need', 'to', 'go', 'deeper'])
+
+    p.join()
+    assert evt.is_set()
+
+
+@pytest.mark.forked
+def test_mpfuture_done_callback():
+    receiver, sender = mp.Pipe(duplex=False)
+    events = [mp.Event() for _ in range(5)]
+
+    def _future_creator():
+        future1, future2, future3 = hivemind.MPFuture(), hivemind.MPFuture(), hivemind.MPFuture()
+
+        def _check_result_and_set(future):
+            assert future.done()
+            assert future.result() == 123
+            events[0].set()
+
+        future1.add_done_callback(_check_result_and_set)
+        future1.add_done_callback(lambda future: events[1].set())
+        future2.add_done_callback(lambda future: events[2].set())
+        future3.add_done_callback(lambda future: events[3].set())
+
+        sender.send((future1, future2))
+        future2.cancel()  # trigger future2 callback from the same process
+
+        events[0].wait()
+        future1.add_done_callback(lambda future: events[4].set())  # schedule callback after future1 is already finished
+
+    p = mp.Process(target=_future_creator)
+    p.start()
+
+    future1, future2 = receiver.recv()
+    future1.set_result(123)
+
+    with pytest.raises(RuntimeError):
+        future1.add_done_callback(lambda future: (1, 2, 3))
+
+    p.join()
+    events[0].wait(1)
+    events[1].wait(1)
+    assert future1.done() and not future1.cancelled()
+    assert future2.done() and future2.cancelled()
+    assert events[0].is_set() and events[1].is_set() and events[2].is_set() and events[4].is_set()
+    assert not events[3].is_set()
+
+
+@pytest.mark.forked
+def test_many_futures():
+    evt = mp.Event()
+    receiver, sender = mp.Pipe()
+    main_futures = [hivemind.MPFuture() for _ in range(1000)]
+    assert len(hivemind.MPFuture._active_futures) == 1000
+
+    def _run_peer():
+        fork_futures = [hivemind.MPFuture() for _ in range(500)]
+        assert len(hivemind.MPFuture._active_futures) == 500
+
+        for i, future in enumerate(random.sample(main_futures, 300)):
+            if random.random() < 0.5:
+                future.set_result(i)
+            else:
+                future.set_exception(ValueError(f"{i}"))
+
+        sender.send(fork_futures[:-100])
+        for future in fork_futures[-100:]:
+            future.cancel()
+
+        evt.wait()
+
+        assert len(hivemind.MPFuture._active_futures) == 200
+        for future in fork_futures:
+            future.cancel()
+        assert len(hivemind.MPFuture._active_futures) == 0
+
+    p = mp.Process(target=_run_peer)
+    p.start()
+
+    some_fork_futures = receiver.recv()
+    assert len(hivemind.MPFuture._active_futures) == 700
+
+    for future in some_fork_futures:
+        future.set_running_or_notify_cancel()
+    for future in random.sample(some_fork_futures, 200):
+        future.set_result(321)
 
-    asyncio.create_task(wait_and_raise())
-    for future in [f1, f2]:
-        with pytest.raises(SystemError):
-            await future
+    time.sleep(0.5)
+    evt.set()
+    for future in main_futures:
+        future.cancel()
+    assert len(hivemind.MPFuture._active_futures) == 0
+    p.join()
 
 
 def test_tensor_compression(size=(128, 128, 64), alpha=5e-08, beta=0.0008):
@@ -139,7 +320,7 @@ def test_tensor_compression(size=(128, 128, 64), alpha=5e-08, beta=0.0008):
     error = deserialize_torch_tensor(serialize_torch_tensor(X, CompressionType.UNIFORM_8BIT)) - X
     assert error.square().mean() < beta
 
-    zeros = torch.zeros(5,5)
+    zeros = torch.zeros(5, 5)
     for compression_type in CompressionType.values():
         assert deserialize_torch_tensor(serialize_torch_tensor(zeros, compression_type)).isfinite().all()