
Add Averager load balancing and public endpoints (#140)

* implement LP load balancing
* averager now relies on DHT to get public endpoint
* add scipy to requirements

Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>
justheuristic, 4 years ago
commit 8466d722da

+ 1 - 1
hivemind/__init__.py

@@ -3,4 +3,4 @@ from hivemind.dht import *
 from hivemind.server import *
 from hivemind.utils import *
 
-__version__ = '0.8.25'
+__version__ = '0.8.26'

+ 37 - 24
hivemind/client/averaging/__init__.py

@@ -12,18 +12,20 @@ from typing import Sequence, Optional, Tuple, Any, Union, Dict, AsyncIterator
 
 import grpc
 import torch
+import numpy as np
 
 import hivemind
 from hivemind.client.averaging.allreduce import AllReduceRunner, AllreduceException, GroupID
 from hivemind.client.averaging.matchmaking import Matchmaking
 from hivemind.proto import averaging_pb2, averaging_pb2_grpc, runtime_pb2
-from hivemind.utils import get_logger, Endpoint, Port, MPFuture, replace_port, GRPC_KEEPALIVE_OPTIONS, get_dht_time
+from hivemind.utils import get_logger, Endpoint, Port, MPFuture, GRPC_KEEPALIVE_OPTIONS, get_dht_time, MSGPackSerializer
 from hivemind.utils.asyncio import anext, achain, aiter, switch_to_uvloop
 
 # flavour types
 StreamCallToLeader = grpc.aio.UnaryStreamCall[averaging_pb2.JoinRequest, averaging_pb2.MessageFromLeader]
 
 INITIAL_GROUP_NBITS = 3
+DataForGather = Any
 logger = get_logger(__name__)
 
 
@@ -52,6 +54,9 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
     :param request_timeout: when looking for group, wait for a response from leader for at most this many seconds.
     :note: request_timeout must be smaller than averaging_expiration to avoid potential deadlocks.
     :param chunk_size_bytes: tensors for AllReduce will be divided into chunks of this size (to improve gRPC throughput)
+    :param throughput: if specified, this value represents the network bandwidth available to the averager.
+          By default, the averager is assumed to have the average bandwidth of its group.
+          If throughput == 0, the averager will run in client-only mode (TODO: not implemented yet!)
     :param listen: if True (default), this averager will accept incoming requests from other peers and perform allreduce
            if False, the averager will register as a freeloader and attempt to fetch vectors from other averagers
     :param listen_on: network interface, e.g. "0.0.0.0:1337" or "localhost:*" (* means pick any port) or "[::]:7654"
@@ -65,15 +70,21 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
     """
     _matchmaking: Matchmaking
     _pending_group_assembled: asyncio.Event
+    serializer = MSGPackSerializer
 
     def __init__(self, averaged_tensors: Sequence[torch.Tensor], dht: hivemind.dht.DHT, *, start: bool,
                  prefix: str, target_group_size: int, min_group_size: int = 2, initial_group_bits: Optional[str] = None,
                  averaging_expiration: float = 15, request_timeout: float = 3, chunk_size_bytes: int = 2 ** 16,
                  allreduce_timeout: Optional[float] = None, averaging_alpha: float = 1.0,
                  compression_type: runtime_pb2.CompressionType = runtime_pb2.CompressionType.NONE,
-                 listen_on: Endpoint = '0.0.0.0:*', receiver_threads: int = 1, daemon: bool = True,
+                 throughput: Optional[float] = None, min_vector_size: int = 0,
+                 listen: bool = True, listen_on: Endpoint = '0.0.0.0:*', receiver_threads: int = 1, daemon: bool = True,
                  channel_options: Optional[Sequence[Tuple[str, Any]]] = None, **kwargs):
         assert '.' not in prefix, "group prefix must be a string without trailing '.'"
+        assert throughput is None or (throughput >= 0 and np.isfinite(np.float32(throughput))), "throughput must be a" \
+                                                                                                " nonnegative float32"
+        if not listen:
+            raise NotImplementedError("Client-only averaging is not implemented yet.")
         if not is_power_of_two(target_group_size):
             logger.warning("It is recommended to set target_group_size to a power of 2.")
         if initial_group_bits is None:
@@ -96,8 +107,9 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         self.matchmaking_kwargs = dict(
             prefix=prefix, initial_group_bits=initial_group_bits, target_group_size=target_group_size,
             min_group_size=min_group_size, averaging_expiration=averaging_expiration, request_timeout=request_timeout,
-            chunk_size_bytes=chunk_size_bytes, compression_type=compression_type)
-        self.averaging_alpha, self.allreduce_timeout = averaging_alpha, allreduce_timeout
+            chunk_size_bytes=chunk_size_bytes, compression_type=compression_type,
+            throughput=throughput, min_vector_size=min_vector_size)
+        self._averaging_alpha, self._allreduce_timeout = averaging_alpha, allreduce_timeout
         self._running_groups: Dict[GroupID, AllReduceRunner] = {}  # one or more assembled groups that run all-reduce
 
         self._pipe, self.pipe = mp.Pipe(duplex=True)  # a control pipe used to communicate with a background process
@@ -114,8 +126,9 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
 
     @property
     def endpoint(self) -> Endpoint:
+        assert self.port is not None, "Averager is not running yet"
         if self._averager_endpoint is None:
-            self._averager_endpoint = replace_port(self.listen_on, self.port if self.port is not None else '*')
+            self._averager_endpoint = f"{self.dht.get_visible_address()}:{self.port}"
             logger.debug(f"Assuming averager endpoint to be {self._averager_endpoint}")
         return self._averager_endpoint
 
@@ -165,48 +178,51 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         else:
             logger.warning("DHT shutdown has no effect: the process is not alive")
 
-    def step(self, allow_retries: bool = True, timeout: Optional[float] = None, wait=True
-             ) -> Union[bool, MPFuture]:
+    def step(self, allow_retries: bool = True, gather: Optional[DataForGather] = None, timeout: Optional[float] = None,
+             wait=True) -> Union[Optional[Dict[Endpoint, DataForGather]], MPFuture]:
         """
         Set up the averager to look for a group and run one round of averaging, return True on success, False on failure
+
         :param allow_retries: if averager fails to run one round of allreduce, this option will allow it to try again
           within the specified timeout
+        :param gather: optionally send this information to all peers in the next group and gather it from every groupmate
+          (this operation is known as all-gather). The gathered data will be available as the output of this function.
         :param timeout: if averager was unable to *find* a group in this many seconds, consider allreduce failed
         :param wait: if True (default), return when finished. Otherwise return MPFuture and run in background.
+        :returns: on success, update averaged_tensors and return group info; on failure, return None
         """
         future, _future = MPFuture.make_pair()
-        self.pipe.send(('_step', [], dict(future=_future, allow_retries=allow_retries, timeout=timeout)))
+        self.pipe.send(('_step', [], dict(future=_future, gather=gather, allow_retries=allow_retries, timeout=timeout)))
         return future.result() if wait else future
 
-    async def _step(self, *, future: MPFuture, allow_retries: bool, timeout: Optional[float]):
+    async def _step(self, *, future: MPFuture, gather: DataForGather, allow_retries: bool, timeout: Optional[float]):
         loop = asyncio.get_event_loop()
         start_time = get_dht_time()
-
-        try_averaging = True
         group_id = None
 
-        while try_averaging:
+        while not future.done():
             try:
                 self._pending_group_assembled.clear()
-                allreduce_group = await self._matchmaking.look_for_group(timeout=timeout)
+                gather_binary = self.serializer.dumps(gather)
+                allreduce_group = await self._matchmaking.look_for_group(timeout=timeout, data_for_gather=gather_binary)
                 if allreduce_group is None:
                     raise AllreduceException("Averaging step failed: could not find a group.")
 
                 group_id = allreduce_group.group_id
                 self._running_groups[group_id] = allreduce_group
                 self._pending_group_assembled.set()
-                await asyncio.wait_for(allreduce_group.run(), self.allreduce_timeout)
-                update_ok = await loop.run_in_executor(None, self.update_tensors, allreduce_group)
+                await asyncio.wait_for(allreduce_group.run(), self._allreduce_timeout)
+                await loop.run_in_executor(None, self.update_tensors, allreduce_group)
 
                 # averaging is finished, exit the loop
-                future.set_result(update_ok)
-                try_averaging = False
+                gathered_items = map(self.serializer.loads, allreduce_group.gathered)
+                gathered_data_by_peer = dict(zip(allreduce_group.ordered_group_endpoints, gathered_items))
+                future.set_result(gathered_data_by_peer)
 
             except AllreduceException:
                 time_elapsed = get_dht_time() - start_time
                 if not allow_retries or (timeout is not None and timeout < time_elapsed):
-                    future.set_result(False)
-                    try_averaging = False
+                    future.set_result(None)
 
             except Exception as e:
                 future.set_exception(e)
@@ -215,11 +231,9 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
                 _ = self._running_groups.pop(group_id, None)
                 self._pending_group_assembled.set()
 
-    def update_tensors(self, allreduce_group: AllReduceRunner) -> bool:
+    def update_tensors(self, allreduce_group: AllReduceRunner):
         """
         a private (extendable) method that applies changes from a finished allreduce to local tensors
-
-        :return: True on success, False on failure
         """
         assert allreduce_group.return_deltas and allreduce_group.future.done()
         averaging_deltas = allreduce_group.future.result()
@@ -227,8 +241,7 @@ class DecentralizedAverager(mp.Process, averaging_pb2_grpc.DecentralizedAveragin
         with torch.no_grad(), self.get_tensors() as local_tensors:
             assert len(local_tensors) == len(self._averaged_tensors)
             for tensor, update in zip(local_tensors, averaging_deltas):
-                tensor.add_(update, alpha=self.averaging_alpha)
-            return True
+                tensor.add_(update, alpha=self._averaging_alpha)
 
     @contextlib.contextmanager
     def get_tensors(self) -> Sequence[torch.Tensor]:

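For context, here is a minimal usage sketch of the new step(gather=...) interface (not part of this commit): each peer contributes a msgpack-serializable value, and on success step() returns a dict mapping every groupmate's endpoint to its contribution, or None if averaging failed. It assumes a local DHT; names and numbers below are illustrative.

import torch
import hivemind

dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')
averagers = [hivemind.DecentralizedAverager([torch.randn(16)], dht=dht, start=True, prefix='demo',
                                            target_group_size=2, initial_group_bits='000')
             for _ in range(2)]

futures = [averager.step(wait=False, gather=dict(batch_size=32 * (i + 1)))
           for i, averager in enumerate(averagers)]
for future in futures:
    gathered = future.result()  # dict {endpoint: metadata} on success, None on failure
    if gathered is not None:
        print({endpoint: meta['batch_size'] for endpoint, meta in gathered.items()})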
+ 16 - 14
hivemind/client/averaging/allreduce.py

@@ -1,5 +1,5 @@
 import asyncio
-from typing import Sequence, Set, Dict, Tuple, Iterable, AsyncIterator, Iterator
+from typing import Sequence, Set, Dict, Tuple, Iterable, AsyncIterator, Any
 
 import grpc
 import torch
@@ -20,15 +20,17 @@ class AllReduceProtocol:
     :param tensors: local tensors that should be averaged with groupmates
     :param endpoint: your endpoint, must be included in ordered_group_endpoints
     :param ordered_group_endpoints: group endpoints ordered s.t. i-th endpoint is responsible for averaging i-th part
+    :param part_sizes: for each peer, the number of vector elements that this peer is responsible for averaging
     :param return_deltas: if True, returns the element-wise differences (averaged_tensors - original_tensors)
            default (False) - return averaged_tensors by themselves
     """
 
     def __init__(self, *, group_id: GroupID, tensors: Sequence[torch.Tensor], endpoint: Endpoint,
-                 ordered_group_endpoints: Sequence[Endpoint], return_deltas: bool = False):
+                 ordered_group_endpoints: Sequence[Endpoint], part_sizes: Tuple[int, ...], return_deltas: bool = False):
         assert endpoint in ordered_group_endpoints, "endpoint is not a part of the group"
-        self.group_id, self.endpoint, self.ordered_group_endpoints = group_id, endpoint, ordered_group_endpoints
-        self.local_tensor_parts = dict(zip(ordered_group_endpoints, split_into_parts(tensors, self.group_size)))
+        self.group_id, self.endpoint = group_id, endpoint
+        self.ordered_group_endpoints, self.part_sizes = ordered_group_endpoints, part_sizes
+        self.local_tensor_parts = dict(zip(ordered_group_endpoints, split_into_parts(tensors, part_sizes)))
         self.tensor_shapes = tuple(tensor.shape for tensor in tensors)
         self.return_deltas = return_deltas
 
@@ -121,17 +123,18 @@ class AllReduceRunner(AllReduceProtocol, averaging_pb2_grpc.DecentralizedAveragi
 
     def __init__(self, *, group_id: GroupID, tensors: Sequence[torch.Tensor], endpoint: Endpoint,
                  ordered_group_endpoints: Sequence[Endpoint], compression_type: runtime_pb2.CompressionType,
-                 chunk_size_bytes: int, return_deltas: bool = False):
-        super().__init__(group_id=group_id, tensors=tensors, endpoint=endpoint,
+                 chunk_size_bytes: int, part_sizes: Tuple[int, ...], gathered: Sequence[Any] = (),
+                 return_deltas: bool = False):
+        super().__init__(group_id=group_id, tensors=tensors, endpoint=endpoint, part_sizes=part_sizes,
                          ordered_group_endpoints=ordered_group_endpoints, return_deltas=return_deltas)
-        self.compression_type, self.chunk_size_bytes = compression_type, chunk_size_bytes
+        self.compression_type, self.chunk_size_bytes, self.gathered = compression_type, chunk_size_bytes, gathered
         self.averaged_part_stream: asyncio.Future[Tuple[runtime_pb2.Tensor, ...]] = asyncio.Future()
 
     def _get_peer_stub(self, peer: Endpoint) -> averaging_pb2_grpc.DecentralizedAveragingStub:
         return ChannelCache.get_stub(peer, averaging_pb2_grpc.DecentralizedAveragingStub, aio=True)
 
-    async def _average_one_part(self, peer_endpoint: Endpoint, local_part: torch.Tensor) -> torch.Tensor:
-        """ Send one part of local tensors to one groupmate and collect the average for this part """
+    async def _communicate_with_peer(self, peer_endpoint: Endpoint, local_part: torch.Tensor) -> torch.Tensor:
+        """ Send a part of local tensors and metadata to a single peer, receive the average for that part of tensors """
         serialized_tensor_part = serialize_torch_tensor(local_part, self.compression_type, allow_inplace=False)
         chunks = split_for_streaming(serialized_tensor_part, self.chunk_size_bytes)
 
@@ -163,7 +166,7 @@ class AllReduceRunner(AllReduceProtocol, averaging_pb2_grpc.DecentralizedAveragi
         send allreduce requests to all peers and collect results, return the averaged tensor (or deltas)
         """
         try:
-            await asyncio.gather(self, *(self._average_one_part(peer, part)
+            await asyncio.gather(self, *(self._communicate_with_peer(peer, part)
                                          for peer, part in self.local_tensor_parts.items() if peer != self.endpoint))
             return await self
         except BaseException as e:
@@ -203,6 +206,7 @@ class AllReduceRunner(AllReduceProtocol, averaging_pb2_grpc.DecentralizedAveragi
                 yield averaging_pb2.AveragingData(code=averaging_pb2.AVERAGED_PART, tensor_part=next(averaged_chunks))
                 for averaged_chunk in averaged_chunks:
                     yield averaging_pb2.AveragingData(tensor_part=averaged_chunk)
+
             except Exception as e:
                 self.set_exception(e)
                 yield averaging_pb2.AveragingData(code=averaging_pb2.INTERNAL_ERROR)
@@ -213,12 +217,10 @@ class AllReduceRunner(AllReduceProtocol, averaging_pb2_grpc.DecentralizedAveragi
             yield averaging_pb2.AveragingData(code=averaging_pb2.INTERNAL_ERROR)
 
 
-def split_into_parts(tensors: Sequence[torch.Tensor], group_size: int) -> Tuple[torch.Tensor, ...]:
+def split_into_parts(tensors: Sequence[torch.Tensor], part_sizes: Tuple[int]) -> Tuple[torch.Tensor, ...]:
     """ combines averaged_tensors into one tensor and splits them into equal chunks of size group_size """
     flat_tensor = torch.cat(tuple(map(torch.Tensor.flatten, tensors)))
-    chunk_slices = torch.linspace(start=0, end=len(flat_tensor), steps=group_size + 1, dtype=torch.int64)
-    chunk_slices[-1] = len(flat_tensor)
-    return tuple(flat_tensor[chunk_slices[i]: chunk_slices[i + 1]] for i in range(group_size))
+    return torch.split_with_sizes(flat_tensor, part_sizes, dim=0)
 
 
 def restore_from_parts(chunks: Sequence[torch.Tensor], shapes: Sequence[torch.Size]) -> Tuple[torch.Tensor, ...]:

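A quick illustration of the reworked partitioning helpers (not part of this commit): part sizes are now passed explicitly and must sum to the total number of elements across all tensors. The numbers below are made up.

import torch
from hivemind.client.averaging.allreduce import split_into_parts, restore_from_parts

tensors = [torch.randn(4, 3), torch.randn(5)]   # 17 elements in total
part_sizes = (10, 5, 2)                         # one entry per peer, must sum to 17
parts = split_into_parts(tensors, part_sizes)   # flat chunks of 10, 5 and 2 elements
restored = restore_from_parts(parts, [tensor.shape for tensor in tensors])
assert all(torch.allclose(new, old) for new, old in zip(restored, tensors))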
+ 98 - 0
hivemind/client/averaging/load_balancing.py

@@ -0,0 +1,98 @@
+from typing import Sequence, Optional, Tuple
+import numpy as np
+import scipy.optimize
+
+from hivemind.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+def load_balance_peers(vector_size, throughputs: Sequence[Optional[float]], min_size: int = 0) -> Tuple[int, ...]:
+    """
+    Find an optimal partitioning of weights for butterfly all-reduce given peer throughputs.
+    :param vector_size: total size of the averaged vector (in elements, not bytes)
+    :param throughputs: 1d array of non-negative throughputs for each peer, typically min(upload speed, download speed)
+    :param min_size: peers that can aggregate less than this many elements will be assigned nothing
+    :returns: an integer array where i-th element is the number of weights assigned to i-th peer
+    """
+    specified_throughputs = [throughput for throughput in throughputs if throughput is not None and throughput > 0]
+
+    if specified_throughputs:
+        default_throughput = np.mean(specified_throughputs)
+        throughputs = [throughput if throughput is not None else default_throughput for throughput in throughputs]
+        scores = optimize_parts_lp(vector_size, np.asarray(throughputs), min_size)
+    else:
+        assert not all(throughput == 0 for throughput in throughputs), "Must have at least one nonzero throughput"
+        scores = np.asarray([1.0 if throughput is None else 0.0 for throughput in throughputs])
+
+    return tuple(hagenbach_bishoff(vector_size, scores))
+
+
+def optimize_parts_lp(vector_size: int, throughputs: np.ndarray, min_size: int = 0, eps: float = 1e-15) -> np.ndarray:
+    """
+    This method solves an optimization problem to minimize the total allreduce time.
+    In butterfly all-reduce, each peer acts both as a "client" and as an "aggregator":
+    * a "client" splits its local vector into shards and sends each shard to one peer, then downloads the average
+    * an "aggregator" receives a certain part of vector components from all peers, aggregates and returns the average
+
+    Peer i network load as a "client" = vector_size * (1 - fraction_assigned_to_peer_i)
+    Peer i network load as an "aggregator" = vector_size * (group_size - 1) * fraction_assigned_to_peer_i
+    Peer i total communication = vector_size * [1 + (group_size - 2) * fraction_assigned_to_peer_i]
+    Total time = max_i (total_communication_for_peer_i / throughputs[i])
+
+    We solve this optimization problem by reducing it to linear programming with a minimax reduction
+    (see lecture notes: https://www.usna.edu/Users/math/dphillip/sa305.s15/phillips/lessons/32/32.pdf )
+
+    :returns: a vector of "scores", i-th score is proportional to the fraction of weights assigned to i-th peer
+    """
+    assert np.all(throughputs >= 0) and np.any(throughputs > 0)
+    permutation = np.argsort(-throughputs)
+    throughputs = throughputs[permutation]
+    is_nonzero = throughputs != 0
+
+    group_size = len(throughputs)
+    num_variables = group_size + 1  # [w_1, ..., w_N, xi]
+
+    c = np.zeros(num_variables)
+    c[-1] = 1.0  # optimize w.r.t. xi
+
+    # the constraints below are tuples (A, b) such that Ax <= b
+    nonnegative_weights = -np.eye(group_size, M=num_variables), np.zeros(group_size)
+    weights_sum_to_one = c[None, :] - 1.0, np.array([-1.0])
+    coeff_per_variable = (group_size - 2.0) / np.maximum(throughputs, eps)
+    coeff_matrix_minus_xi = np.hstack([np.diag(coeff_per_variable), -np.ones((group_size, 1))])
+    xi_is_maximum = coeff_matrix_minus_xi[is_nonzero], -1.0 / throughputs[is_nonzero]
+    force_max_weights = np.eye(group_size, M=num_variables), is_nonzero.astype(c.dtype)
+
+    A, b = list(map(np.concatenate, zip(nonnegative_weights, weights_sum_to_one, xi_is_maximum, force_max_weights)))
+
+    solution = scipy.optimize.linprog(c, A_ub=A, b_ub=b)
+    if solution.success:
+        peer_scores = solution.x[:group_size]
+        # if some peers have less than min_size elements, transfer their share to other peers (if any)
+        if np.max(peer_scores) >= min_size / float(vector_size):
+            peer_scores[peer_scores < min_size / float(vector_size)] = 0.0
+    else:
+        logger.error(f"Failed to solve load-balancing for bandwidths {throughputs}.")
+        peer_scores = np.ones(group_size)
+
+    return peer_scores[np.argsort(permutation)]
+
+
+def hagenbach_bishoff(vector_size: int, scores: Sequence[float]) -> Sequence[int]:
+    """
+    Split a vector between participants based on continuous fractions.
+    https://en.wikipedia.org/wiki/Hagenbach-Bischoff_system
+    The code is based on https://github.com/crflynn/voting
+
+    :param vector_size: the total number of elements to be split
+    :param scores: real-valued vector fractions for each peer
+    :returns: integer-valued partitions assigned to every peer
+    """
+    total_score = sum(scores)
+    allocated = [int(vector_size * score_i / total_score) for score_i in scores]
+    while sum(allocated) < vector_size:
+        quotients = [score / (allocated[idx] + 1) for idx, score in enumerate(scores)]
+        idx_max = quotients.index(max(quotients))
+        allocated[idx_max] += 1
+    return allocated

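A hedged usage sketch for the load balancer above (not part of this commit): peers with higher throughput receive proportionally larger partitions, None falls back to the mean of the known throughputs, and the partition sizes always sum to vector_size. Units and numbers are illustrative.

from hivemind.client.averaging.load_balancing import load_balance_peers

part_sizes = load_balance_peers(vector_size=1_000_000,
                                throughputs=[100e6, 50e6, None],  # any consistent bandwidth unit
                                min_size=1024)                    # shards smaller than this are reassigned
assert sum(part_sizes) == 1_000_000 and all(size >= 0 for size in part_sizes)
print(part_sizes)  # the fastest peer is assigned the largest shard in this example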
+ 48 - 20
hivemind/client/averaging/matchmaking.py

@@ -6,20 +6,20 @@ import contextlib
 import random
 from dataclasses import asdict
 from math import isfinite
-from typing import Sequence, Optional, AsyncIterator, Set, Tuple
+from typing import Sequence, Optional, AsyncIterator, Set, Tuple, Dict
 import asyncio
 
-import torch
 import grpc
+import torch
 
 import hivemind
-from hivemind.client.averaging.allreduce import AllReduceRunner, GroupID
+from hivemind.client.averaging.allreduce import AllReduceRunner
+from hivemind.client.averaging.load_balancing import load_balance_peers
 from hivemind.dht import DHTID, DHTExpiration, get_dht_time, GroupKey
 from hivemind.utils import get_logger, Endpoint, TensorDescriptor, MSGPackSerializer, TimedStorage
-from hivemind.proto import averaging_pb2, averaging_pb2_grpc, runtime_pb2
+from hivemind.proto import averaging_pb2, averaging_pb2_grpc
 from hivemind.utils.grpc import ChannelCache
 
-
 logger = get_logger(__name__)
 
 
@@ -34,12 +34,12 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
       This deadlock only happens if averagers have outdated information on expirations (due to network delays).
       While A->B->A deadlock is easy to fix, it gets much harder with more peers (e.g. A -> B -> C -> D -> A).
       Hence, instead of accounting for such deadlocks, we simply break them with request_timeout.
-    
     """
 
     def __init__(self, endpoint: Endpoint, averaged_tensors: Sequence[torch.Tensor], dht: hivemind.dht.DHT, *,
                  prefix: str, target_group_size: int, min_group_size: int, initial_group_bits: Optional[str] = None,
-                 averaging_expiration: float = 15, request_timeout: float, **allreduce_kwargs):
+                 averaging_expiration: float = 15, request_timeout: float, throughput: Optional[float] = None,
+                 min_vector_size: int, **allreduce_kwargs):
         assert '.' not in prefix, "group prefix must be a string without ."
         if request_timeout is None or request_timeout >= averaging_expiration:
             logger.warning("It is recommended to use request_timeout smaller than averaging_expiration. Otherwise,"
@@ -50,8 +50,10 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
         self.prefix, self.group_bits = prefix, initial_group_bits
         self.target_group_size, self.min_group_size = target_group_size, min_group_size
         self.averaging_expiration, self.request_timeout = averaging_expiration, request_timeout
+        self.throughput, self.min_vector_size = throughput, min_vector_size
         self.allreduce_kwargs = allreduce_kwargs
         self.schema_hash = compute_schema_hash(self.averaged_tensors)
+        self.total_size = sum(tensor.numel() for tensor in self.averaged_tensors)
 
         self.lock_looking_for_group = asyncio.Lock()
         self.lock_request_join_group = asyncio.Lock()
@@ -60,8 +62,9 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
         self.assembled_group = asyncio.Future()
 
         self.current_leader: Optional[Endpoint] = None  # iff i am a follower, this is a link to my current leader
-        self.current_followers: Set[Endpoint] = set()  # iff i am a leader, this contains my followers excluding myself
+        self.current_followers: Dict[Endpoint, averaging_pb2.JoinRequest] = {}  # my current followers excluding myself
         self.potential_leaders = PotentialLeaders(endpoint, dht, averaging_expiration, target_group_size)
+        self.data_for_gather: bytes = None
 
     @property
     def is_looking_for_group(self):
@@ -82,8 +85,11 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
         return f"{self.__class__.__name__}(endpoint={self.endpoint}, schema={schema_hash_repr}, {lfg_status}" \
                f" current key = {self.current_group_key})"
 
-    async def look_for_group(self, *, timeout: Optional[float] = None) -> Optional[AllReduceRunner]:
+    async def look_for_group(self, *, data_for_gather: bytes = b'', timeout: Optional[float] = None
+                             ) -> Optional[AllReduceRunner]:
         """
+        :param data_for_gather: optionally send this data to all peers in the next group and gather it from every groupmate
+        :param timeout: maximum time that may be spent looking for group (does not include allreduce itself)
         :returns: an assembled group if successful, None if failed; does NOT perform the actual averaging
         Iterate over the averagers from a given group_identifier that have higher leadership priority than yourself.
         """
@@ -91,6 +97,7 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
             logger.info("Another look_for_group is already in progress. The current run will be scheduled after"
                         " the existing group is either assembled or disbanded.")
         async with self.lock_looking_for_group:
+            self.data_for_gather = data_for_gather
             request_leaders_task = asyncio.create_task(self._request_join_potential_leaders(timeout))
             try:
                 return await asyncio.wait_for(self.assembled_group, timeout=timeout)
@@ -116,6 +123,7 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
                 # note: the code above ensures that we send all followers away before creating new future
                 self.assembled_group = asyncio.Future()
                 self.was_accepted_to_group.clear()
+                self.data_for_gather = None
 
     async def _request_join_potential_leaders(self, timeout: Optional[float]) -> AllReduceRunner:
         """ Request leaders from queue until we find the first runner. This coroutine is meant to run in background. """
@@ -161,7 +169,9 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
             async with self.lock_request_join_group:
                 leader_stub = ChannelCache.get_stub(leader, averaging_pb2_grpc.DecentralizedAveragingStub, aio=True)
                 call = leader_stub.rpc_join_group(averaging_pb2.JoinRequest(
-                    endpoint=self.endpoint, schema_hash=self.schema_hash, expiration=expiration_time))
+                    endpoint=self.endpoint, schema_hash=self.schema_hash, expiration=expiration_time,
+                    throughput=self.throughput if self.throughput is not None else -1.0,
+                    gather=self.data_for_gather))
                 message = await asyncio.wait_for(call.read(), timeout=self.request_timeout)
 
                 if message.code == averaging_pb2.ACCEPTED:
@@ -182,8 +192,7 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
 
                 if message.code == averaging_pb2.BEGIN_ALLREDUCE:
                     async with self.lock_request_join_group:
-                        return await self.follower_assemble_group(
-                            leader, message.group_id, message.ordered_group_endpoints)
+                        return await self.follower_assemble_group(leader, message)
 
             if message.code in (averaging_pb2.GROUP_DISBANDED, averaging_pb2.CANCELLED):
                 if message.suggested_leader and message.suggested_leader != self.endpoint:
@@ -218,7 +227,7 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
                     yield reason_to_reject
                     return
 
-                self.current_followers.add(request.endpoint)
+                self.current_followers[request.endpoint] = request
                 yield averaging_pb2.MessageFromLeader(code=averaging_pb2.ACCEPTED)
 
                 if len(self.current_followers) + 1 >= self.target_group_size and not self.assembled_group.done():
@@ -253,14 +262,15 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
             allreduce_group = self.assembled_group.result()
             yield averaging_pb2.MessageFromLeader(
                 code=averaging_pb2.BEGIN_ALLREDUCE, group_id=allreduce_group.group_id,
-                ordered_group_endpoints=allreduce_group.ordered_group_endpoints)
+                ordered_group_endpoints=allreduce_group.ordered_group_endpoints,
+                part_sizes=allreduce_group.part_sizes, gathered=allreduce_group.gathered)
 
         except Exception as e:
             logger.exception(e)
             yield averaging_pb2.MessageFromLeader(code=averaging_pb2.INTERNAL_ERROR)
 
         finally:  # note: this code is guaranteed to run even if the coroutine is destroyed prematurely
-            self.current_followers.discard(request.endpoint)
+            self.current_followers.pop(request.endpoint, None)
             self.follower_was_discarded.set()
 
     def _check_reasons_to_reject(self, request: averaging_pb2.JoinRequest) -> Optional[averaging_pb2.MessageFromLeader]:
@@ -297,22 +307,40 @@ class Matchmaking(averaging_pb2_grpc.DecentralizedAveragingServicer):
         ordered_group_endpoints = list(self.current_followers)
         ordered_group_endpoints.append(self.endpoint)
         random.shuffle(ordered_group_endpoints)
+
+        throughputs, gathered = [], []
+        for endpoint in ordered_group_endpoints:
+            if endpoint == self.endpoint:
+                throughputs.append(self.throughput)
+                gathered.append(self.data_for_gather)
+            else:
+                follower_info = self.current_followers[endpoint]
+                throughputs.append(follower_info.throughput if follower_info.throughput >= 0 else None)
+                gathered.append(follower_info.gather if follower_info.gather else None)
+
+        part_sizes = load_balance_peers(self.total_size, throughputs, self.min_vector_size)
+
         logger.debug(f"{self.endpoint} - leader started allreduce for {len(ordered_group_endpoints)} peers.")
         allreduce_group = AllReduceRunner(group_id=group_id, tensors=self.averaged_tensors, endpoint=self.endpoint,
-                                          ordered_group_endpoints=ordered_group_endpoints, **self.allreduce_kwargs)
+                                          ordered_group_endpoints=ordered_group_endpoints, part_sizes=part_sizes,
+                                          gathered=gathered, **self.allreduce_kwargs)
         self.assembled_group.set_result(allreduce_group)
         return allreduce_group
 
-    async def follower_assemble_group(self, leader: Endpoint, group_id: GroupID,
-                                      ordered_group_endpoints: Sequence[Endpoint]) -> AllReduceRunner:
+    async def follower_assemble_group(self, leader: Endpoint, msg: averaging_pb2.MessageFromLeader) -> AllReduceRunner:
         """ Prepare to run allreduce using a list of peers provided by our leader """
         assert self.lock_looking_for_group.locked() and self.lock_request_join_group.locked()
         assert not self.assembled_group.done()
-        logger.debug(f"{self.endpoint} - follower started allreduce after being prompted by leader {leader}.")
         assert self.current_leader == leader, f"averager does not follow {leader} (actual: {self.current_leader})"
+
+        group_id, ordered_group_endpoints, part_sizes = msg.group_id, msg.ordered_group_endpoints, msg.part_sizes
         assert self.endpoint in ordered_group_endpoints, "Leader sent us group_endpoints that does not contain us!"
+        assert len(ordered_group_endpoints) == len(part_sizes) == len(msg.gathered)
+
+        logger.debug(f"{self.endpoint} - follower started allreduce after being prompted by leader {leader}.")
         allreduce_group = AllReduceRunner(group_id=group_id, tensors=self.averaged_tensors, endpoint=self.endpoint,
-                                          ordered_group_endpoints=ordered_group_endpoints, **self.allreduce_kwargs)
+                                          ordered_group_endpoints=tuple(ordered_group_endpoints),
+                                          part_sizes=tuple(part_sizes), gathered=msg.gathered, **self.allreduce_kwargs)
         self.assembled_group.set_result(allreduce_group)
         return allreduce_group
 

+ 6 - 2
hivemind/proto/averaging.proto

@@ -33,6 +33,8 @@ message JoinRequest {
   string endpoint = 1;          // A follower accepts incoming allreduce requests at this address
   bytes schema_hash = 2;        // A hash that describes follower's tensors (shapes, num tensors, etc)
   double expiration = 3;        // Follower would like to **begin** all_reduce by this point in time
+  bytes gather = 4;             // optional metadata that is gathered from all peers (e.g. batch size or current loss)
+  float throughput = 5;         // Follower has this bandwidth for averaging (0 = default, negative = client only)
 }
 
 message MessageFromLeader {
@@ -40,11 +42,13 @@ message MessageFromLeader {
   bytes group_id = 2;        // a unique identifier of this group, only valid until allreduce is finished/failed
   string suggested_leader = 3;  // if peer is already in a group, it'll provide us with an endpoint of its leader
   repeated string ordered_group_endpoints = 4;  // a sequence of peers, each responsible for one shard during averaging
+  repeated int32 part_sizes = 5;  // sizes of the tensor parts assigned to each peer, same order as endpoints
+  repeated bytes gathered = 6;  // metadata (gather) from all groupmates in the same order as their endpoints
 }
 
 message AveragingData {
   MessageCode code = 1;     // in case of a protocol violation, this will be the error message
-  bytes group_id = 2;        // a unique group identifier, same as in MessageFromLeader
+  bytes group_id = 2;       // a unique group identifier, same as in MessageFromLeader
   string endpoint = 3;      // sender's rpc endpoint, used for coordination
-  Tensor tensor_part = 4;    // either peer's local tensor part (rpc input) or group average of this part (rpc output)
+  Tensor tensor_part = 4;   // either peer's local tensor part (rpc input) or group average of this part (rpc output)
 }

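For reference, a hedged sketch of how the extended JoinRequest is filled in from Python (not part of this commit), mirroring Matchmaking.request_join_group above; the endpoint and metadata values are made up.

from hivemind.proto import averaging_pb2
from hivemind.utils import MSGPackSerializer

request = averaging_pb2.JoinRequest(
    endpoint='192.0.2.1:31337',
    schema_hash=b'\x00' * 32,     # placeholder for the real schema hash
    expiration=1234567890.0,
    throughput=100e6,             # advertised bandwidth; the matchmaking code sends -1.0 when it is unspecified
    gather=MSGPackSerializer.dumps(dict(batch_size=256)))  # opaque bytes returned to every groupmate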
+ 1 - 0
requirements.txt

@@ -1,6 +1,7 @@
 PyYAML
 torch>=1.6.0
 numpy>=1.17
+scipy>=1.2.1
 prefetch_generator>=1.0.1
 msgpack>=0.5.6
 sortedcontainers

+ 76 - 5
tests/test_averaging.py

@@ -1,11 +1,12 @@
 import asyncio
 import random
-import time
 
+import numpy as np
 import torch
 import pytest
 import hivemind
 from hivemind.client.averaging.allreduce import AllReduceProtocol, split_into_parts, restore_from_parts
+from hivemind.client.averaging.load_balancing import load_balance_peers
 from hivemind.utils import Endpoint
 
 
@@ -35,7 +36,7 @@ def test_getset_averagers():
 
 @pytest.mark.forked
 def test_allreduce_once():
-    dht = hivemind.DHT(start=True)
+    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')
 
     tensors1 = [torch.randn(123), torch.zeros(3)]
     tensors2 = [torch.rand(123), torch.ones(3)]
@@ -53,7 +54,9 @@ def test_allreduce_once():
     for averager in averagers:
         futures.append(averager.step(wait=False))
     for future in futures:
-        assert future.result() is True
+        result = future.result()
+        for averager in averagers:
+            assert averager.endpoint in result
 
     for averager in averagers:
         with averager.get_tensors() as averaged_tensors:
@@ -61,6 +64,31 @@ def test_allreduce_once():
                 assert torch.allclose(ref, our, atol=1e-6)
 
 
+@pytest.mark.forked
+def test_allgather():
+    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')
+    averagers = [hivemind.DecentralizedAverager(torch.ones(1), dht=dht, target_group_size=4, averaging_expiration=15,
+                                                prefix='mygroup', initial_group_bits='000', listen_on='127.0.0.1:*',
+                                                start=True)
+                 for _ in range(8)]
+
+    futures = []
+    for i, averager in enumerate(averagers):
+        futures.append(averager.step(wait=False, gather=dict(batch_size=123 + i, foo='bar')))
+
+    assert len(set(repr(sorted(future.result())) for future in futures)) == 2
+
+    reference_metadata = {averager.endpoint: dict(batch_size=123 + i, foo='bar')
+                          for i, averager in enumerate(averagers)}
+    for future in futures:
+        gathered = future.result()
+
+        assert len(gathered) == 4
+
+        for endpoint in gathered:
+            assert gathered[endpoint] == reference_metadata[endpoint]
+
+
 @pytest.mark.forked
 @pytest.mark.asyncio
 async def test_allreduce_protocol():
@@ -72,7 +100,8 @@ async def test_allreduce_protocol():
 
     group_id = random.getrandbits(160).to_bytes(length=20, byteorder='big')
     allreduce_protocols = [AllReduceProtocol(
-        group_id=group_id, endpoint=peer, tensors=tensors_by_peer[peer], ordered_group_endpoints=peers)
+        group_id=group_id, endpoint=peer, tensors=tensors_by_peer[peer],
+        ordered_group_endpoints=peers, part_sizes=(150, 200, 67))
         for peer in peers]
 
     async def _accumulate(sender: Endpoint, recipient: Endpoint):
@@ -112,10 +141,52 @@ def test_partitioning():
         if total_size == 0:
             continue
         num_chunks = random.randint(1, min(1000, sum(x.numel() for x in tensors)))
-        chunks = split_into_parts(tensors, group_size=num_chunks)
+        part_sizes = load_balance_peers(total_size, [None] * num_chunks)
+        chunks = split_into_parts(tensors, part_sizes)
         assert len(chunks) == num_chunks
         shapes = [tensor.shape for tensor in tensors]
         restored = restore_from_parts(chunks, shapes)
         assert len(restored) == len(tensors)
         assert all(new.shape == old.shape for new, old in zip(restored, tensors))
         assert all(torch.allclose(new, old) for new, old in zip(restored, tensors))
+
+
+def get_cost(vector_size, partitions, throughputs):
+    return max((vector_size - partitions[i] + (len(partitions) - 1) * partitions[i]) / max(throughputs[i], 1e-9)
+               for i in range(len(partitions)))
+
+
+def check_optimality(vector_size, throughputs, ref_partitions):
+    partitions = list(load_balance_peers(vector_size, throughputs))
+    assert get_cost(vector_size, partitions, throughputs) <= get_cost(vector_size, ref_partitions, throughputs)
+
+
+@pytest.mark.forked
+def test_load_balancing():
+    check_optimality(60, np.array([0.25, 0.25, 0.25, 0.25]), [15, 15, 15, 15])
+    check_optimality(1024, np.array([0.3, 0.5, 0.9]), [0, 255, 769])
+    check_optimality(60, np.array([0.44, 0.33, 0.22]), [42, 18, 0])
+    check_optimality(60, np.array([0.55, 0.44, 0.40]), [35, 16, 9])
+    check_optimality(1024 * 1024, np.array([0.3, 0.5, 0.9, 0.6]), [0, 169327, 602629, 276620])
+    check_optimality(1024 * 1024, np.array([0.0, 0.5, 0.0, 0.6]), [0, 428963, 0, 619613])
+    assert load_balance_peers(60, np.array([0.55, 0.44, 0.40]), min_size=10) == (41, 19, 0)
+    assert load_balance_peers(60, np.array([0.32, 0.55, 0.44]), min_size=10) == (0, 40, 20)
+    assert load_balance_peers(2, np.array([0.55, 0.20, 0.44]), min_size=10) == (1, 0, 1)
+    assert load_balance_peers(1, np.array([0.55, 0.20, 0.44]), min_size=10) == (1, 0, 0)
+
+    assert load_balance_peers(100, (None, None)) == (50, 50)
+    assert load_balance_peers(100, (None, None, None, None, None)) == (20, 20, 20, 20, 20)
+    assert load_balance_peers(100, (0, 0, 0, None, None)) == (0, 0, 0, 50, 50)
+
+    with pytest.raises(AssertionError):
+        load_balance_peers(100, (0, 0, 0))
+
+    for i in range(10):
+        vector_size = np.random.randint(1, 1024 ** 3)
+        num_peers = np.random.randint(1, 256)
+        scale = 1e-9 + np.random.rand() * 1e5
+        throughputs = np.random.rand(num_peers) * scale + 1e-6
+        min_size = np.random.choice([0, np.random.randint(0, vector_size // 10)])
+        assignment = load_balance_peers(vector_size, throughputs, min_size)
+        assert np.sum(assignment) == vector_size
+        assert np.min(assignment) >= 0