
Refactor for v0.9.10 and fix example (#319)

- fixed imports in example
- renamed listen=False -> client_mode=True (both options were present; chose client_mode; see the usage sketch below)
- increased the default metadata_expiration to better reflect the optimal training configuration
- changed the CollaborativeCallback backup mechanism so that backups are stored in separate buffers on CPU
- renamed throughput -> bandwidth (both options were present; chose bandwidth because throughput also refers to compute, e.g. benchmark_throughput.py)
- re-run examples/albert
- update example outputs in examples/albert/README.md
- added _parent_pid to DHT and *Averager in order to fix incorrect __del__ in some edge cases
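
For reference, a minimal sketch of how the renamed options look at a call site after this change. The constructor path and remaining keyword arguments are taken from the tests further down this page and may differ in other hivemind versions:

```python
import torch
import hivemind

# assumption: DecentralizedAverager is importable via hivemind.averaging
# (see hivemind/averaging/averager.py in the diff below)
dht = hivemind.DHT(start=True)
averager = hivemind.averaging.DecentralizedAverager(
    [torch.zeros(8)],      # tensors this peer contributes to averaging
    dht=dht,
    prefix="mygroup",
    target_group_size=2,
    client_mode=True,      # formerly listen=False
    bandwidth=100e6,       # formerly throughput=...
    start=True,
)
```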

Co-authored-by: Aleksandr Borzunov <hxrussia@gmail.com>
Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com>
justheuristic 4 years ago
Parent
Current commit
11db5fd56f

+ 4 - 3
examples/albert/README.md

@@ -28,7 +28,8 @@ Run the first DHT peer to welcome trainers and record training statistics (e.g.,

 ```
 $ python run_training_monitor.py --experiment_prefix my-albert-v1 --wandb_project Demo-run
-[2021/06/17 16:26:36.083][INFO][root.log_visible_maddrs:42] Running a DHT peer. To connect other peers to this one, use --initial_peers /ip4/8.8.8.8/tcp/1337/p2p/XXXX /ip4/8.8.8.8/udp/31337/quic/p2p/XXXX
+[2021/06/17 16:26:36.083][INFO][root.log_visible_maddrs:54] Running a DHT peer. To connect other peers to this one over the Internet, 
+use --initial_peers /ip4/1.2.3.4/tcp/1337/p2p/XXXX /ip4/1.2.3.4/udp/31337/quic/p2p/XXXX
 wandb: Currently logged in as: XXX (use `wandb login --relogin` to force relogin)
 wandb: Tracking run with wandb version 0.10.32
 wandb: Syncing run dry-mountain-2
@@ -61,9 +62,9 @@ To join the collaboration with a GPU trainer,

   Here, `ONE_OR_MORE_PEERS` stands for multiaddresses of one or multiple existing peers (training monitors or existing
   trainers)
-  collected from the first lines of their terminal output. For the example above, the multiaddresses would be:
+  collected from the first lines of their terminal output. For the example above, the (dummy) multiaddresses would be:
   ```
-  --initial_peers /ip4/8.8.8.8/tcp/1337/p2p/XXXX /ip4/8.8.8.8/udp/31337/quic/p2p/XXXX
+  --initial_peers /ip4/1.2.3.4/tcp/1337/p2p/XXXX /ip4/1.2.3.4/udp/31337/quic/p2p/XXXX
   ```

   <details>

+ 4 - 1
examples/albert/arguments.py

@@ -69,7 +69,7 @@ class AveragerArguments:
     )
     target_group_size: int = field(default=256, metadata={"help": "Maximum group size for all-reduce"})
     metadata_expiration: float = field(
-        default=30, metadata={"help": "Peer's metadata will be removed if not updated in this many seconds"}
+        default=120, metadata={"help": "Peer's metadata will be removed if not updated in this many seconds"}
     )


@@ -101,6 +101,9 @@ class CollaborationArguments(CollaborativeOptimizerArguments, BaseTrainingArgume
     statistics_expiration: float = field(
         default=600, metadata={"help": "Statistics will be removed if not updated in this many seconds"}
     )
+    backup_every_steps: int = field(
+        default=10, metadata={"help": "In case of NaN, training restore from a backup updated with this frequency."}
+    )


 @dataclass
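
A quick way to confirm the new option and its default without running training (a sketch; assumes examples/albert is on the Python path):

```python
from dataclasses import fields

from arguments import CollaborationArguments  # examples/albert/arguments.py

# inspect the field added above instead of instantiating the dataclass,
# since its required arguments are unrelated to this change
backup_field = next(f for f in fields(CollaborationArguments) if f.name == "backup_every_steps")
print(backup_field.default)            # -> 10
print(backup_field.metadata["help"])   # restore-from-backup frequency used after a NaN
```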

+ 36 - 17
examples/albert/run_trainer.py

@@ -2,9 +2,10 @@

 import logging
 import os
+import pickle
 from dataclasses import asdict
 from pathlib import Path
-from typing import Dict, Any
+from typing import Any

 import torch
 import transformers
@@ -18,6 +19,8 @@ from transformers.trainer import Trainer
 from transformers.trainer_utils import is_main_process

 import hivemind
+from hivemind.utils.compression import CompressionType
+
 import utils
 from arguments import CollaborationArguments, DatasetArguments, AlbertTrainingArguments, AveragerArguments

@@ -93,6 +96,11 @@ def get_optimizer_and_scheduler(training_args, model):


 class CollaborativeCallback(transformers.TrainerCallback):
+    """
+    This callback monitors and reports collaborative training progress,
+    In case of a catastrophic failure, it can also revert training to a backup
+    """
+
     def __init__(
         self,
         dht: hivemind.DHT,
@@ -100,6 +108,7 @@ class CollaborativeCallback(transformers.TrainerCallback):
         model: torch.nn.Module,
         local_public_key: bytes,
         statistics_expiration: float,
+        backup_every_steps: int,
     ):
         super().__init__()
         self.model = model
@@ -107,11 +116,12 @@ class CollaborativeCallback(transformers.TrainerCallback):
         self.local_public_key = local_public_key
         self.statistics_expiration = statistics_expiration
         self.last_reported_collaboration_step = -1
-        self.previous_state = self.get_current_state()
         self.samples = 0
         self.steps = 0
         self.loss = 0
         self.total_samples_processed = 0
+        self.backup_every_steps = backup_every_steps
+        self.latest_backup = self.backup_state()

     def on_train_begin(
         self, args: TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs
@@ -124,9 +134,8 @@ class CollaborativeCallback(transformers.TrainerCallback):
     ):
         control.should_log = True
         if not self.params_are_finite():
-            self.load_from_state(self.previous_state)
+            self.restore_from_backup(self.latest_backup)
             return control
-        self.previous_state = self.get_current_state()

         if state.log_history:
             self.loss += state.log_history[-1]["loss"]
@@ -146,6 +155,8 @@ class CollaborativeCallback(transformers.TrainerCallback):
                 logger.info(f"Your current contribution: {self.total_samples_processed} samples")
                 if self.steps:
                     logger.info(f"Local loss: {self.loss / self.steps}")
+                if self.collaborative_optimizer.local_step % self.backup_every_steps == 0:
+                    self.latest_backup = self.backup_state()

                 self.loss = 0
                 self.steps = 0
@@ -162,15 +173,6 @@ class CollaborativeCallback(transformers.TrainerCallback):

         return control

-    @torch.no_grad()
-    def get_current_state(self) -> Dict[str, Any]:
-        return {"model": self.model.state_dict(), "opt": self.collaborative_optimizer.opt.state_dict()}
-
-    @torch.no_grad()
-    def load_from_state(self, state):
-        self.model.load_state_dict(state["model"])
-        self.collaborative_optimizer.opt.load_state_dict(state["opt"])
-
     @torch.no_grad()
     def params_are_finite(self):
         for param in self.model.parameters():
@@ -178,6 +180,18 @@ class CollaborativeCallback(transformers.TrainerCallback):
                 return False
         return True

+    @torch.no_grad()
+    def backup_state(self) -> Any:
+        return pickle.dumps(
+            {"model": self.model.state_dict(), "training": self.collaborative_optimizer.opt.state_dict()}
+        )
+
+    @torch.no_grad()
+    def restore_from_backup(self, backup):
+        state = pickle.loads(backup)
+        self.model.load_state_dict(state["model"])
+        self.collaborative_optimizer.opt.load_state_dict(state["training"])
+

 class NoOpScheduler(LRSchedulerBase):
     """Dummy scheduler for transformers.Trainer. The real scheduler is defined in CollaborativeOptimizer.scheduler"""
@@ -229,7 +243,7 @@ def main():
     dht = hivemind.DHT(
         start=True,
         initial_peers=collaboration_args.initial_peers,
-        listen=not collaboration_args.client_mode,
+        client_mode=collaboration_args.client_mode,
         record_validators=validators,
         use_ipfs=collaboration_args.use_ipfs,
         host_maddrs=collaboration_args.host_maddrs,
@@ -248,9 +262,9 @@ def main():
         dht=dht,
         scheduler=scheduler,
         prefix=collaboration_args.experiment_prefix,
-        compression_type=hivemind.utils.CompressionType.Value(collaboration_args.compression),
+        compression_type=CompressionType.Value(collaboration_args.compression),
         batch_size_per_step=total_batch_size_per_step,
-        throughput=collaboration_args.bandwidth,
+        bandwidth=collaboration_args.bandwidth,
         target_batch_size=adjusted_target_batch_size,
         client_mode=collaboration_args.client_mode,
         verbose=True,
@@ -274,7 +288,12 @@ def main():
         optimizers=(collaborative_optimizer, NoOpScheduler(collaborative_optimizer)),
         callbacks=[
             CollaborativeCallback(
-                dht, collaborative_optimizer, model, local_public_key, collaboration_args.statistics_expiration
+                dht,
+                collaborative_optimizer,
+                model,
+                local_public_key,
+                collaboration_args.statistics_expiration,
+                collaboration_args.backup_every_steps,
             )
         ],
     )
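
For context, the backup mechanism above boils down to serializing both state dicts into a single bytes object; a standalone sketch with a toy model and optimizer (not the exact trainer code):

```python
import pickle

import torch

model = torch.nn.Linear(4, 2)                      # stand-in for the ALBERT model
opt = torch.optim.SGD(model.parameters(), lr=0.1)  # stand-in for collaborative_optimizer.opt

def backup_state() -> bytes:
    # serializing copies the tensors out of the live training buffers
    return pickle.dumps({"model": model.state_dict(), "training": opt.state_dict()})

def restore_from_backup(backup: bytes) -> None:
    state = pickle.loads(backup)
    model.load_state_dict(state["model"])
    opt.load_state_dict(state["training"])

latest_backup = backup_state()
# ... train; if parameters become non-finite (NaN/Inf), roll back:
restore_from_backup(latest_backup)
```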

+ 4 - 2
examples/albert/run_training_monitor.py

@@ -13,6 +13,8 @@ from torch_optimizer import Lamb
 from transformers import AlbertForPreTraining, AlbertConfig, HfArgumentParser

 import hivemind
+from hivemind.utils.compression import CompressionType
+
 import utils
 from arguments import BaseTrainingArguments, CollaborativeOptimizerArguments, AveragerArguments

@@ -99,8 +101,8 @@ class CheckpointHandler:
             opt=opt,
             dht=dht,
             prefix=experiment_prefix,
-            compression_type=hivemind.utils.CompressionType.Value(collab_optimizer_args.compression),
-            throughput=collab_optimizer_args.bandwidth,
+            compression_type=CompressionType.Value(collab_optimizer_args.compression),
+            bandwidth=collab_optimizer_args.bandwidth,
             target_batch_size=adjusted_target_batch_size,
             client_mode=collab_optimizer_args.client_mode,
             verbose=True,

+ 12 - 2
examples/albert/utils.py

@@ -3,6 +3,7 @@ from typing import Dict, List, Tuple
 from multiaddr import Multiaddr
 from pydantic import BaseModel, StrictFloat, confloat, conint

+from hivemind import choose_ip_address
 from hivemind.dht.crypto import RSASignatureValidator
 from hivemind.dht.schema import BytesWithPublicKey, SchemaValidator
 from hivemind.dht.validation import RecordValidatorBase
@@ -41,8 +42,17 @@ def log_visible_maddrs(visible_maddrs: List[Multiaddr], only_p2p: bool) -> None:
         unique_addrs = {addr["p2p"] for addr in visible_maddrs}
         initial_peers_str = " ".join(f"/p2p/{addr}" for addr in unique_addrs)
     else:
-        initial_peers_str = " ".join(str(addr) for addr in visible_maddrs)
+        available_ips = [Multiaddr(addr) for addr in visible_maddrs if "ip4" in addr]
+        available_ips += [Multiaddr(addr) for addr in visible_maddrs if "ip6" in addr]
+        if available_ips:
+            preferred_ip = choose_ip_address(available_ips)
+            selected_maddrs = [addr for addr in visible_maddrs if preferred_ip in str(addr)]
+        else:
+            selected_maddrs = visible_maddrs
+        initial_peers_str = " ".join(str(addr) for addr in selected_maddrs)
+
     logger.info(
-        f"Running a DHT peer. To connect other peers to this one, use "
+        f"Running a DHT peer. To connect other peers to this one over the Internet, use "
         f"{TextStyle.BOLD}{TextStyle.BLUE}--initial_peers {initial_peers_str}{TextStyle.RESET}"
     )
+    logger.info(f"Full list of visible multiaddresses: {' '.join(str(addr) for addr in visible_maddrs)}")

+ 32 - 31
hivemind/averaging/averager.py

@@ -31,7 +31,7 @@ from hivemind.utils import Endpoint, Port, MPFuture, get_logger, TensorDescripto
 from hivemind.utils.asyncio import anext, achain, aiter, switch_to_uvloop
 from hivemind.utils.compression import serialize_torch_tensor, deserialize_torch_tensor
 from hivemind.utils.grpc import ChannelCache, GRPC_KEEPALIVE_OPTIONS, split_for_streaming, combine_from_streaming
-from hivemind.utils.networking import choose_ip_address, strip_port
+from hivemind.utils.networking import choose_ip_address, strip_port, Hostname
 from hivemind.utils.serializer import MSGPackSerializer, SerializerBase
 from hivemind.utils.timed_storage import get_dht_time, ValueWithExpiration, DHTExpiration

@@ -64,11 +64,11 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
     :param request_timeout: when looking for group, wait for a response from leader for at most this many seconds.
     :note: request_timeout must be smaller than averaging_expiration to avoid potential deadlocks.
     :param part_size_bytes: tensors for AllReduce are processed in parts of up to this size (after compression)
-    :param throughput: if specified, this value represents the network bandwidth available to averager.
+    :param bandwidth: if specified, this value represents the network bandwidth available to averager.
           By default, the averager is assumed to have the average bandwidth of his group.
-          If throughput == 0, averager will rely on its groupmates to do all the averaging.
-    :param listen: if True (default), this averager will accept incoming requests from other peers and perform allreduce
-            if False, the averager will register as a freeloader and attempt to fetch vectors from other averagers
+          If bandwidth == 0, averager will rely on its groupmates to do all the averaging.
+    :param client_mode: if False (default), this averager will accept incoming requests from other peers
+            if True, the averager will only join existing groups where at least one peer has client_mode=False
     :param listen_on: network interface, e.g. "0.0.0.0:1337" or "localhost:*" (* means pick any port) or "[::]:7654"
     :param announced_host: visible IP address the averager will announce for external connections from other peers.
           If None, the address will be chosen from p2p.get_visible_maddrs() (global IPv4 addresses are preferred)
@@ -115,11 +115,11 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         part_size_bytes: int = DEFAULT_PART_SIZE_BYTES,
         allreduce_timeout: Optional[float] = None,
         compression_type: runtime_pb2.CompressionType = runtime_pb2.CompressionType.NONE,
-        throughput: Optional[float] = None,
+        bandwidth: Optional[float] = None,
         min_vector_size: int = 0,
         auxiliary: bool = False,
         allow_state_sharing: Optional[bool] = None,
-        listen: bool = True,
+        client_mode: bool = False,
         listen_on: Endpoint = "0.0.0.0:*",
         daemon: bool = True,
         announced_host: Optional[str] = None,
@@ -128,18 +128,19 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         **kwargs,
     ):
         assert "." not in prefix, "group prefix must be a string without trailing '.'"
-        assert throughput is None or (
-            throughput >= 0 and np.isfinite(np.float32(throughput))
-        ), "throughput must be a non-negative float32"
+        assert bandwidth is None or (
+            bandwidth >= 0 and np.isfinite(np.float32(bandwidth))
+        ), "bandwidth must be a non-negative float32"
         if not is_power_of_two(target_group_size):
             logger.warning("It is recommended to set target_group_size to a power of 2.")
         assert initial_group_bits is None or all(bit in "01" for bit in initial_group_bits)
-        assert listen or not auxiliary, "auxiliary peers must accept incoming connections"
+        assert not client_mode or not auxiliary, "auxiliary peers must accept incoming connections"

         super().__init__()
         self.dht = dht
-        self.listen, self.listen_on, self.kwargs = listen, listen_on, kwargs
-        if not self.listen:
+        self.client_mode, self.listen_on, self.kwargs = client_mode, listen_on, kwargs
+        self._parent_pid = os.getpid()
+        if self.client_mode:
             self.mode = AveragingMode.CLIENT
         elif auxiliary:
             self.mode = AveragingMode.AUX
@@ -161,7 +162,7 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         self.total_size = sum(map(torch.Tensor.numel, self._averaged_tensors))
         self.schema_hash = compute_schema_hash(self._averaged_tensors)
         self.shutdown_timeout = shutdown_timeout
-        self.throughput = throughput
+        self.bandwidth = bandwidth

         self.matchmaking_kwargs = dict(
             prefix=prefix,
@@ -181,10 +182,12 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         self._port = mp.Value(ctypes.c_uint32, 0)  # assigned when averager starts, accessible via self.port

         self._allow_state_sharing = mp.Value(ctypes.c_bool, 0)
-        self.allow_state_sharing = (listen and not auxiliary) if allow_state_sharing is None else allow_state_sharing
+        if allow_state_sharing is None:
+            allow_state_sharing = not client_mode and not auxiliary
+        self.allow_state_sharing = allow_state_sharing

         self._averager_endpoint: Optional[Endpoint] = None
-        if not self.listen:
+        if self.client_mode:
             self._averager_endpoint = f"client::{uuid.uuid4()}"

         self.ready = mp.Event()  # whether the averager process has started (and ready for incoming requests)
@@ -221,16 +224,14 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin

     @allow_state_sharing.setter
     def allow_state_sharing(self, value: bool):
-        if value is True and not self.listen:
-            logger.warning(
-                "Cannot allow state sharing: averager in client mode (listen=False) cannot share its state."
-            )
+        if value and self.client_mode:
+            raise ValueError("Cannot allow state sharing: averager in client mode cannot share its state.")
         else:
             self._allow_state_sharing.value = value

     @property
     def endpoint(self) -> Optional[Endpoint]:
-        if self.listen and self._averager_endpoint is None:
+        if self._averager_endpoint is None and not self.client_mode:
             assert self.port is not None, "Averager is not running yet"
             self._averager_endpoint = f"{self.announced_host}:{self.port}"
             logger.debug(f"Assuming averager endpoint to be {self._averager_endpoint}")
@@ -258,7 +259,7 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
             async def _run():
                 grpc.aio.init_grpc_aio()

-                if self.listen:
+                if not self.client_mode:
                     self._server = grpc.aio.server(**self.kwargs, options=GRPC_KEEPALIVE_OPTIONS)
                     averaging_pb2_grpc.add_DecentralizedAveragingServicer_to_server(self, self._server)
                     found_port = self._server.add_insecure_port(self.listen_on)
@@ -269,9 +270,9 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
                     logger.debug(f"The averager is running in client mode.")

                 self._matchmaking = Matchmaking(
-                    self.endpoint, self.schema_hash, self.dht, **self.matchmaking_kwargs, client_mode=not self.listen
+                    self.endpoint, self.schema_hash, self.dht, **self.matchmaking_kwargs, client_mode=self.client_mode
                 )
-                if self.listen:
+                if not self.client_mode:
                     asyncio.create_task(self._declare_for_download_periodically())

                 self._pending_group_assembled = asyncio.Event()
@@ -312,7 +313,7 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         remaining_tasks = set()
         for group in self._running_groups.values():
             remaining_tasks.update(group.finalize(cancel=True))
-        if self.listen:
+        if not self.client_mode:
             remaining_tasks.add(self._server.stop(timeout))
         await asyncio.gather(*remaining_tasks)

@@ -374,7 +375,7 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
             while not future.done():
                 try:
                     self._pending_group_assembled.clear()
-                    data_for_gather = self.serializer.dumps([weight, self.throughput, self.mode.value, gather_binary])
+                    data_for_gather = self.serializer.dumps([weight, self.bandwidth, self.mode.value, gather_binary])
                     group_info = await self._matchmaking.look_for_group(
                         timeout=timeout, data_for_gather=data_for_gather
                     )
@@ -422,16 +423,16 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
     async def _run_allreduce(self, group_info: GroupInfo, min_vector_size: int, **kwargs) -> GatheredData:
         """Run All-Reduce in a given group and update tensors in place, return gathered metadata"""
         try:
-            weights, throughputs, mode_ids, user_gathered = zip(*map(self.serializer.loads, group_info.gathered))
+            weights, bandwidths, mode_ids, user_gathered = zip(*map(self.serializer.loads, group_info.gathered))
             user_gathered = dict(zip(group_info.endpoints, map(self.serializer.loads, user_gathered)))
             modes = tuple(map(AveragingMode, mode_ids))

-            # compute optimal part sizes from peer throughputs; TODO: replace with proper load balancing
-            incoming_throughputs = [
-                thr if mode != AveragingMode.CLIENT else 0.0 for thr, mode in zip(throughputs, modes)
+            # compute optimal part sizes from peer bandwidths; TODO: replace with proper load balancing
+            download_bandwidths = [
+                thr if mode != AveragingMode.CLIENT else 0.0 for thr, mode in zip(bandwidths, modes)
             ]
             peer_fractions = await asyncio.get_event_loop().run_in_executor(
-                None, load_balance_peers, self.total_size, incoming_throughputs, min_vector_size
+                None, load_balance_peers, self.total_size, download_bandwidths, min_vector_size
             )

             async with self.get_tensors_async() as local_tensors:
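
One behavioral change worth noting above: allow_state_sharing now raises instead of logging a warning when the averager runs in client mode. A hedged sketch of the new behavior (constructor arguments mirror the tests in this commit; adjust for your setup):

```python
import torch
import hivemind

dht = hivemind.DHT(start=True)
client = hivemind.averaging.DecentralizedAverager(
    [torch.zeros(4)], dht=dht, prefix="mygroup", target_group_size=2, client_mode=True, start=True
)
try:
    client.allow_state_sharing = True   # used to log a warning and silently refuse
except ValueError as e:
    print(e)                            # client-mode averagers cannot share their state
finally:
    client.shutdown()
    dht.shutdown()
```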

+ 21 - 21
hivemind/averaging/load_balancing.py

@@ -9,30 +9,30 @@ logger = get_logger(__name__)
 LOAD_BALANCING_LP_DECIMALS = 9


-def load_balance_peers(vector_size, throughputs: Sequence[Optional[float]], min_size: int = 0) -> Tuple[int, ...]:
+def load_balance_peers(vector_size, bandwidths: Sequence[Optional[float]], min_size: int = 0) -> Tuple[int, ...]:
     """
-    Find an optimal partitioning of weights for butterfly all-reduce given peer throughputs.
+    Find an optimal partitioning of weights for butterfly all-reduce given peer bandwidths.
     :param vector_size: total size of the averaged vector (in elements, not bytes)
-    :param throughputs: 1d array of non-negative throughputs for each peer capable of averaging
+    :param bandwidths: 1d array of non-negative bandwidths for each peer capable of averaging
       zeros stand for client-only participants, None represents "not specified" (resolved as mean of other pears)
     :param min_size: peers that can aggregate less than this many elements will be assigned nothing
     :returns: an integer array where i-th element is the number of weights assigned to i-th peer
     """
-    specified_throughputs = [throughput for throughput in throughputs if throughput is not None and throughput > 0]
+    specified_bandwidth = [item for item in bandwidths if item is not None and item > 0]

-    if specified_throughputs:
-        default_throughput = np.mean(specified_throughputs)
-        throughputs = [throughput if throughput is not None else default_throughput for throughput in throughputs]
-        scores = optimize_parts_lp(vector_size, np.asarray(throughputs), min_size)
+    if specified_bandwidth:
+        default_bandwidth = np.mean(specified_bandwidth)
+        bandwidths = [item if item is not None else default_bandwidth for item in bandwidths]
+        scores = optimize_parts_lp(vector_size, np.asarray(bandwidths), min_size)
     else:
-        assert not all(throughput == 0 for throughput in throughputs), "Must have at least one nonzero throughput"
-        scores = np.asarray([1.0 if throughput is None else 0.0 for throughput in throughputs])
+        assert not all(item == 0 for item in bandwidths), "Must have at least one nonzero bandwidth"
+        scores = np.asarray([1.0 if item is None else 0.0 for item in bandwidths])

     # TODO(jheuristic) we no longer need hagenbach-bishoff with new AllReduceRunner
     return tuple(hagenbach_bishoff(vector_size, scores))


-def optimize_parts_lp(vector_size: int, throughputs: np.ndarray, min_size: int = 0) -> np.ndarray:
+def optimize_parts_lp(vector_size: int, bandwidths: np.ndarray, min_size: int = 0) -> np.ndarray:
     """
     This method solves an optimization problem to minimize the total allreduce time.
     In butterfly all-reduce, each peer acts both as a "client" and as an "aggregator":
@@ -42,20 +42,20 @@ def optimize_parts_lp(vector_size: int, throughputs: np.ndarray, min_size: int =
     Peer i network load as a "client" = vector_size * (1 - fraction_assigned_to_peer_i)
     Peer i network load as an "aggregator" = vector_size * (group_size - 1) * fraction_assigned_to_peer_i
     Peer i total communication = vector_size * [1 + (group_size - 2) * fraction_assigned_to_peer_i]
-    Total time = max_i (total_communication_for_peer_i / throughputs[i])
+    Total time = max_i (total_communication_for_peer_i / bandwidths[i])

     We solve this optimization problem by reducing it to linear programming with a minimax reduction
     (see lecture notes: https://www.usna.edu/Users/math/dphillip/sa305.s15/phillips/lessons/32/32.pdf )

     :returns: a vector of "scores", i-th score is proportional to the fraction of weights assigned to i-th peer
     """
-    assert np.all(throughputs >= 0) and np.any(throughputs > 0)
-    throughputs = np.asarray(throughputs, dtype=np.float64)
-    permutation = np.argsort(-throughputs)
-    throughputs = throughputs[permutation]
-    is_nonzero = throughputs != 0
+    assert np.all(bandwidths >= 0) and np.any(bandwidths > 0)
+    bandwidths = np.asarray(bandwidths, dtype=np.float64)
+    permutation = np.argsort(-bandwidths)
+    bandwidths = bandwidths[permutation]
+    is_nonzero = bandwidths != 0

-    group_size = len(throughputs)
+    group_size = len(bandwidths)
     num_variables = group_size + 1  # [w_1, ..., w_N, xi]

     c = np.zeros(num_variables, dtype=np.float64)
@@ -64,9 +64,9 @@ def optimize_parts_lp(vector_size: int, throughputs: np.ndarray, min_size: int =
     # the constraints below are tuples (A, b) such that Ax <= b
     nonnegative_weights = -np.eye(group_size, num_variables, dtype=c.dtype), np.zeros(group_size, c.dtype)
     weights_sum_to_one = c[None, :] - 1.0, np.array([-1.0])
-    coeff_per_variable = (group_size - 2.0) / np.maximum(throughputs, 10 ** -LOAD_BALANCING_LP_DECIMALS)
+    coeff_per_variable = (group_size - 2.0) / np.maximum(bandwidths, 10 ** -LOAD_BALANCING_LP_DECIMALS)
     coeff_matrix_minus_xi = np.hstack([np.diag(coeff_per_variable), -np.ones((group_size, 1), c.dtype)])
-    xi_is_maximum = coeff_matrix_minus_xi[is_nonzero], -1.0 / throughputs[is_nonzero]
+    xi_is_maximum = coeff_matrix_minus_xi[is_nonzero], -1.0 / bandwidths[is_nonzero]
     force_max_weights = np.eye(group_size, M=num_variables, dtype=c.dtype), is_nonzero.astype(c.dtype)

     A, b = list(map(np.concatenate, zip(nonnegative_weights, weights_sum_to_one, xi_is_maximum, force_max_weights)))
@@ -79,7 +79,7 @@ def optimize_parts_lp(vector_size: int, throughputs: np.ndarray, min_size: int =
             peer_scores[peer_scores < min_size / float(vector_size)] = 0.0
         peer_scores = np.round(peer_scores, LOAD_BALANCING_LP_DECIMALS)
     else:
-        logger.error(f"Failed to solve load-balancing for bandwidths {throughputs}.")
+        logger.error(f"Failed to solve load-balancing for bandwidths {bandwidths}.")
         peer_scores = np.ones(group_size, c.dtype)

     return peer_scores[np.argsort(permutation)]
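
A small usage sketch of the renamed function, based on the signature and docstring above (numbers are arbitrary):

```python
from hivemind.averaging.load_balancing import load_balance_peers

# two full peers with different bandwidths plus one client-only peer (bandwidth 0)
assignment = load_balance_peers(vector_size=1_000_000, bandwidths=[1e9, 1e8, 0.0])
print(assignment)       # a tuple of ints summing to 1_000_000; the client-only peer gets 0 elements
print(sum(assignment))  # 1000000
```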

+ 1 - 7
hivemind/dht/__init__.py

@@ -26,7 +26,6 @@ from multiaddr import Multiaddr
 from hivemind.dht.node import DHTID, DHTNode
 from hivemind.dht.routing import DHTKey, DHTValue, Subkey
 from hivemind.dht.validation import CompositeValidator, RecordValidatorBase
-from hivemind.p2p import P2P
 from hivemind.utils import DHTExpiration, MPFuture, ValueWithExpiration, await_cancelled, get_logger, switch_to_uvloop

 logger = get_logger(__name__)
@@ -40,9 +39,6 @@ class DHT(mp.Process):
     * hivemind servers periodically announce their experts via declare_experts (dht_handler.py)
     * trainers find most suitable experts via RemoteMixtureOfExperts (beam_search.py)

-    :param p2p: instance of hivemind.p2p.P2P that will be used for communication.
-      If None, DHTNode will create and manage its own P2P instance with given initial_peers and
-      parameters from ``kwargs``
     :param initial_peers: multiaddrs of one or more active DHT peers (if you want to join an existing DHT)
     :param start: if True, automatically starts the background process on creation. Otherwise await manual start
     :param daemon: if True, the background process is marked as daemon and automatically terminated after main process
@@ -60,7 +56,6 @@ class DHT(mp.Process):

     def __init__(
         self,
-        p2p: Optional[P2P] = None,
         initial_peers: Optional[Sequence[Union[Multiaddr, str]]] = None,
         *,
         start: bool,
@@ -70,9 +65,9 @@ class DHT(mp.Process):
         shutdown_timeout: float = 3,
         **kwargs,
     ):
+        self._parent_pid = os.getpid()
         super().__init__()

-        self.p2p = p2p
         if not (
             initial_peers is None
             or (
@@ -101,7 +96,6 @@

             async def _run():
                 self._node = await DHTNode.create(
-                    p2p=self.p2p,
                     initial_peers=self.initial_peers,
                     num_workers=self.max_workers or 1,
                     record_validator=self._record_validator,
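
The _parent_pid mentioned in the commit message guards process-level cleanup so that only the process that created the object runs it; a generic sketch of the pattern (not the exact hivemind code):

```python
import os

class BackgroundWorker:
    def __init__(self):
        self._parent_pid = os.getpid()   # remember the creating process

    def shutdown(self):
        print("shutting down")

    def __del__(self):
        # forked children and other edge cases must not trigger the parent's shutdown
        if os.getpid() == self._parent_pid:
            self.shutdown()
```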

+ 4 - 4
hivemind/dht/node.py

@@ -114,7 +114,7 @@ class DHTNode:
         chunk_size: int = 16,
         blacklist_time: float = 5.0,
         backoff_rate: float = 2.0,
-        listen: bool = True,
+        client_mode: bool = False,
         record_validator: Optional[RecordValidatorBase] = None,
         authorizer: Optional[AuthorizerBase] = None,
         validate: bool = True,
@@ -154,8 +154,8 @@ class DHTNode:
         :param backoff_rate: blacklist time will be multiplied by :backoff_rate: for each successive non-response
         :param validate: if True, use initial peers to validate that this node is accessible and synchronized
         :param strict: if True, any error encountered in validation will interrupt the creation of DHTNode
-        :param listen: if True (default), this node will accept incoming request and otherwise be a DHT "citzen"
-          if False, this node will refuse any incoming request, effectively being only a "client"
+        :param client_mode: if False (default), this node will accept incoming requests as a full DHT "citzen"
+          if True, this node will refuse any incoming requests, effectively being only a client
         :param record_validator: instance of RecordValidatorBase used for signing and validating stored records
         :param authorizer: instance of AuthorizerBase used for signing and validating requests and response
           for a given authorization protocol
@@ -203,7 +203,7 @@ class DHTNode:
             wait_timeout,
             parallel_rpc,
             cache_size,
-            listen,
+            client_mode,
             record_validator,
             authorizer,
         )

+ 6 - 6
hivemind/dht/protocol.py

@@ -43,7 +43,7 @@ class DHTProtocol(ServicerBase):
         wait_timeout: float,
         parallel_rpc: Optional[int] = None,
         cache_size: Optional[int] = None,
-        listen=True,
+        client_mode: bool = False,
         record_validator: Optional[RecordValidatorBase] = None,
         authorizer: Optional[AuthorizerBase] = None,
     ) -> DHTProtocol:
@@ -66,15 +66,15 @@ class DHTProtocol(ServicerBase):
         self.storage, self.cache = DHTLocalStorage(), DHTLocalStorage(maxsize=cache_size)
         self.routing_table = RoutingTable(node_id, bucket_size, depth_modulo)
         self.rpc_semaphore = asyncio.Semaphore(parallel_rpc if parallel_rpc is not None else float("inf"))
-        self.listen = listen
+        self.client_mode = client_mode
         self.record_validator = record_validator
         self.authorizer = authorizer

-        if listen:
+        if not client_mode:
             await self.add_p2p_handlers(self.p2p, AuthRPCWrapper(self, AuthRole.SERVICER, self.authorizer))

             self.node_info = dht_pb2.NodeInfo(node_id=node_id.to_bytes())
-        else:  # client-only mode
+        else:
             # note: use empty node_info so peers won't add you to their routing tables
             self.node_info = dht_pb2.NodeInfo()
         return self
@@ -95,7 +95,7 @@ class DHTProtocol(ServicerBase):
         :param peer: peer ID to ping
         :param validate: if True, validates that node's peer_id is available
         :param strict: if strict=True, validation will raise exception on fail, otherwise it will only warn
-        :note: if DHTProtocol was created with listen=True, also request peer to add you to his routing table
+        :note: if DHTProtocol was created with client_mode=False, also request peer to add you to his routing table

         :return: node's DHTID, if peer responded and decided to send his node_id
         """
@@ -112,7 +112,7 @@ class DHTProtocol(ServicerBase):

         if responded and validate:
             try:
-                if self.listen and not response.available:
+                if not self.client_mode and not response.available:
                     raise ValidationError(
                         f"Peer {peer} can't access this node. " f"Probably, libp2p has failed to bypass the firewall"
                     )

+ 1 - 1
hivemind/optim/adaptive.py

@@ -29,6 +29,6 @@ class CollaborativeAdaptiveOptimizer(CollaborativeOptimizer):
             average_opt_statistics=average_opt_statistics,
             prefix=f"{self.prefix}_averaging",
             allreduce_timeout=self.averaging_timeout,
-            listen=not self.client_mode,
+            client_mode=self.client_mode,
             **kwargs,
         )

+ 2 - 2
hivemind/optim/collaborative.py

@@ -167,7 +167,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
             average_gradients=True,
             prefix=f"{self.prefix}_averaging",
             allreduce_timeout=self.averaging_timeout,
-            listen=not self.client_mode,
+            client_mode=self.client_mode,
             **kwargs,
         )

@@ -359,7 +359,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
                     samples_accumulated=self.local_samples_accumulated,
                     samples_per_second=self.performance_ema.samples_per_second,
                     time=current_time,
-                    client_mode=not self.averager.listen,
+                    client_mode=self.averager.client_mode,
                 )

             self.dht.store(

+ 1 - 1
hivemind/utils/mpfuture.py

@@ -70,7 +70,7 @@ class MPFuture(base.Future, Generic[ResultType]):
     _status_requests: Optional[Dict[UID, Tuple[MPFuture, threading.Event]]] = None  # futures to be updated by origin
     _active_pid: Optional[PID] = None  # pid of currently active process; used to handle forks natively

-    SOFT_UPDATE_TIMEOUT = 0.1  # seconds spent awaiting status update before warning is printed
+    SOFT_UPDATE_TIMEOUT = 0.5  # seconds spent awaiting status update before warning is printed
     HARD_UPDATE_TIMEOUT = 10.0  # seconds spent awaiting status update before future is automatically cancelled

     def __init__(self, *, synchronize: bool = True, use_lock: bool = True):

+ 13 - 13
tests/test_averaging.py

@@ -76,7 +76,7 @@ def _test_allreduce_once(n_clients, n_aux):
             target_group_size=4,
             averaging_expiration=15,
             prefix="mygroup",
-            listen=mode != AveragingMode.CLIENT,
+            client_mode=mode == AveragingMode.CLIENT,
             listen_on="127.0.0.1:*",
             auxiliary=mode == AveragingMode.AUX,
             start=True,
@@ -121,8 +121,8 @@ def test_allreduce_weighted(n_client_mode_peers: int = 2):
     dht = hivemind.DHT(start=True)

     n_peers = 4
-    should_listen = [False] * n_client_mode_peers + [True] * (n_peers - n_client_mode_peers)
-    random.shuffle(should_listen)
+    client_modes = [True] * n_client_mode_peers + [False] * (n_peers - n_client_mode_peers)
+    random.shuffle(client_modes)

     tensors1 = [torch.randn(123), torch.zeros(3)]
     tensors2 = [torch.rand(123), torch.ones(3)]
@@ -135,11 +135,11 @@ def test_allreduce_weighted(n_client_mode_peers: int = 2):
             target_group_size=4,
             averaging_expiration=15,
             prefix="mygroup",
-            listen=listen,
+            client_mode=client_mode,
             listen_on="127.0.0.1:*",
             start=True,
         )
-        for tensors, listen in zip([tensors1, tensors2, tensors3, tensors4], should_listen)
+        for tensors, client_mode in zip([tensors1, tensors2, tensors3, tensors4], client_modes)
     ]
     weights = list(map(float, np.random.rand(len(averagers)) * 10 + 0.01))
     reference = [
@@ -180,7 +180,7 @@ def test_allreduce_compression():
             [x.clone() for x in tensors1],
             dht=dht,
             compression_type=compression_type_pair,
-            listen=False,
+            client_mode=True,
             target_group_size=2,
             prefix="mygroup",
             start=True,
@@ -306,16 +306,16 @@ def test_allgather():
     dht.shutdown()


-def get_cost(vector_size, partitions, throughputs):
+def get_cost(vector_size, partitions, bandwidths):
     return max(
-        (vector_size - partitions[i] + (len(partitions) - 1) * partitions[i]) / max(throughputs[i], 1e-9)
+        (vector_size - partitions[i] + (len(partitions) - 1) * partitions[i]) / max(bandwidths[i], 1e-9)
         for i in range(len(partitions))
     )


-def check_optimality(vector_size, throughputs, ref_partitions):
-    partitions = list(load_balance_peers(vector_size, throughputs))
-    assert get_cost(vector_size, partitions, throughputs) <= get_cost(vector_size, ref_partitions, throughputs)
+def check_optimality(vector_size, bandwidths, ref_partitions):
+    partitions = list(load_balance_peers(vector_size, bandwidths))
+    assert get_cost(vector_size, partitions, bandwidths) <= get_cost(vector_size, ref_partitions, bandwidths)


 @pytest.mark.forked
@@ -342,9 +342,9 @@ def test_load_balancing():
         vector_size = np.random.randint(1, 1024 ** 3)
         num_peers = np.random.randint(1, 256)
         scale = 1e-9 + np.random.rand() * 1e5
-        throughputs = np.random.rand(num_peers) * scale + 1e-6
+        bandwidths = np.random.rand(num_peers) * scale + 1e-6
         min_size = np.random.choice([0, np.random.randint(0, vector_size // 10)])
-        assignment = load_balance_peers(vector_size, throughputs, min_size)
+        assignment = load_balance_peers(vector_size, bandwidths, min_size)
         assert np.sum(assignment) == vector_size
         assert np.min(assignment) >= 0
 
 

+ 1 - 1
tests/test_dht.py

@@ -102,7 +102,7 @@ async def test_dht_get_visible_maddrs():

     dummy_endpoint = Multiaddr("/ip4/123.45.67.89/tcp/31337")
     p2p = await hivemind.p2p.P2P.create(announce_maddrs=[dummy_endpoint])
-    dht = hivemind.DHT(p2p, start=True)
+    dht = hivemind.DHT(start=True, p2p=await p2p.replicate(p2p.daemon_listen_maddr))

     assert dht.get_visible_maddrs() == [dummy_endpoint.encapsulate(f"/p2p/{p2p.id}")]
     dht.shutdown()

+ 6 - 5
tests/test_dht_node.py

@@ -77,11 +77,12 @@ def test_dht_protocol():
     peer2_node_id, peer2_proc, peer2_id, _ = launch_protocol_listener(initial_peers=peer1_maddrs)

     loop = asyncio.get_event_loop()
-    for listen in [False, True]:  # note: order matters, this test assumes that first run uses listen=False
+    for client_mode in [True, False]:  # note: order matters, this test assumes that first run uses client mode
+        peer_id = DHTID.generate()
         p2p = loop.run_until_complete(P2P.create(initial_peers=peer1_maddrs))
         protocol = loop.run_until_complete(
             DHTProtocol.create(
-                p2p, DHTID.generate(), bucket_size=20, depth_modulo=5, wait_timeout=5, num_replicas=3, listen=listen
+                p2p, peer_id, bucket_size=20, depth_modulo=5, wait_timeout=5, num_replicas=3, client_mode=client_mode
             )
         )
         logger.info(f"Self id={protocol.node_id}")
@@ -150,7 +151,7 @@ def test_dht_protocol():
         assert recv_dict.data[subkey1] == (protocol.serializer.dumps(value1), expiration)
         assert recv_dict.data[subkey2] == (protocol.serializer.dumps(value2), expiration + 5)

-        if listen:
+        if not client_mode:
             loop.run_until_complete(p2p.shutdown())

     peer1_proc.terminate()
@@ -166,7 +167,7 @@ def test_empty_table():
     p2p = loop.run_until_complete(P2P.create(initial_peers=peer_maddrs))
     protocol = loop.run_until_complete(
         DHTProtocol.create(
-            p2p, DHTID.generate(), bucket_size=20, depth_modulo=5, wait_timeout=5, num_replicas=3, listen=False
+            p2p, DHTID.generate(), bucket_size=20, depth_modulo=5, wait_timeout=5, num_replicas=3, client_mode=True
         )
     )

@@ -353,7 +354,7 @@ async def test_dhtnode_caching(T=0.05):
     node1 = await DHTNode.create(
         initial_peers=await node2.protocol.p2p.get_visible_maddrs(),
         cache_refresh_before_expiry=5 * T,
-        listen=False,
+        client_mode=True,
         reuse_get_requests=False,
     )
     await node2.store("k", [123, "value"], expiration_time=hivemind.get_dht_time() + 7 * T)