
Implement CenteredClip in averager

Alexander Borzunov 4 years ago
parent commit fee619527f

+ 118 - 0
hivemind/averaging/accumulators.py

@@ -0,0 +1,118 @@
+import dataclasses
+from abc import ABC, abstractmethod
+from typing import Callable, Optional
+
+import torch
+
+
+class AccumulatorBase(ABC):
+    @abstractmethod
+    def accumulate_part(self, tensor: torch.Tensor, weight: float) -> None:
+        ...
+
+    @abstractmethod
+    def reduce(self) -> torch.Tensor:
+        ...
+
+
+AccumulatorFactory = Callable[[torch.Size, int], AccumulatorBase]
+
+
+class MeanAccumulator(AccumulatorBase):
+    def __init__(self, part_shape: torch.Size, _n_peers: int):
+        self._accumulator = torch.zeros(part_shape)
+        self._denominator = 0.0
+
+    def accumulate_part(self, tensor_part: torch.Tensor, weight: float) -> None:
+        self._accumulator.add_(tensor_part, alpha=weight)
+        self._denominator += weight
+
+    def reduce(self) -> torch.Tensor:
+        return self._accumulator.div_(self._denominator)
+
+
+class CenteredClipAccumulator(AccumulatorBase):
+    def __init__(self, part_shape: torch.Size, n_peers: int, **kwargs):
+        self._kwargs = kwargs
+
+        self._tensors = torch.empty((n_peers, *part_shape))  # buffer one part per peer
+        self._weights = torch.empty(n_peers)
+        self._index = 0
+
+    def accumulate_part(self, tensor_part: torch.Tensor, weight: float) -> None:
+        self._tensors[self._index] = tensor_part
+        self._weights[self._index] = weight
+        self._index += 1
+
+    def reduce(self) -> torch.Tensor:
+        clipped = centered_clip(self._tensors, self._weights, **self._kwargs)
+        return clipped.result
+
+
+@dataclasses.dataclass(frozen=True)
+class CenteredClipResult:
+    result: torch.Tensor
+    n_clipped: torch.Tensor
+    last_step_delta: Optional[torch.Tensor]
+
+
+def centered_clip(
+    input_tensors: torch.Tensor,
+    weights: torch.Tensor,
+    tau: float = 1.0,
+    n_iters: int = 20,
+    stop_delta: Optional[float] = None,
+) -> CenteredClipResult:
+    """
+    Optimized implementation of CenteredClip from [Karimireddy, 2021].
+    Intended to be used in a decentralized fashion as in [Gorbunov, 2021].
+
+    :param tau: clipping radius; peer contributions whose distance from the current center exceeds ``tau`` are clipped
+    :param n_iters: maximum number of fixed-point iterations
+    :param stop_delta: stop iterating early if the ``L_inf`` norm of the last step is less than ``stop_delta``.
+        Note: if this option is used, the step norm calculations may increase the time per iteration by ~25%.
+
+    References:
+
+    [Karimireddy, 2021] Karimireddy, Sai Praneeth, Lie He, and Martin Jaggi. "Learning from history for byzantine
+    robust optimization." International Conference on Machine Learning. PMLR, 2021.
+
+    [Gorbunov, 2021] Gorbunov, Eduard, Alexander Borzunov, Michael Diskin, and Max Ryabinin.
+    "Secure Distributed Training at Scale." arXiv preprint arXiv:2106.11257 (2021).
+    """
+
+    with torch.no_grad():
+        n_peers = input_tensors.shape[0]
+        result_shape = input_tensors.shape[1:]
+
+        input_tensors = input_tensors.flatten(start_dim=1)
+        weights = weights / weights.sum()  # normalize a copy instead of mutating the caller's tensor
+
+        # This finds medians faster than torch.median() and torch.quantile(q=0.5),
+        # see https://github.com/pytorch/pytorch/issues/51450
+        sorted_tensors = input_tensors.sort(dim=0).values
+        result = sorted_tensors[n_peers // 2].clone()
+        delta = None
+
+        diff = torch.sub(input_tensors, result, out=sorted_tensors)  # Reuse memory from `sorted_tensors`
+        for _ in range(n_iters):
+            norms = diff.norm(dim=1)
+            coeffs = weights * torch.minimum(torch.tensor(1.0), tau / norms)
+
+            if stop_delta is not None:
+                prev_diff = result  # Reuse memory from `result`
+                prev_diff.copy_(diff[0])
+
+            # We only need to update `diff` (not `result`) between iterations.
+            # Computing the step as a temporary avoids aliasing `diff` as both input and output of an in-place matmul
+            diff -= coeffs @ diff
+
+            if stop_delta is not None:
+                delta = prev_diff.sub_(diff[0]).abs_().max()
+                if delta < stop_delta:
+                    break
+        torch.sub(input_tensors[0], diff[0], out=result)
+
+        return CenteredClipResult(
+            result=result.reshape(result_shape), n_clipped=(tau < norms).sum(), last_step_delta=delta
+        )
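
For orientation, a minimal sketch of calling `centered_clip` on synthetic parts; the peer count, `tau`, and the outlier magnitude below are illustrative, not part of the commit:

```python
import torch

from hivemind.averaging.accumulators import centered_clip

# 7 honest peers plus one outlier that would badly skew a plain weighted mean
honest = torch.randn(7, 16)
outlier = torch.full((1, 16), 100.0)
parts = torch.cat([honest, outlier])
weights = torch.ones(8)

clipped = centered_clip(parts, weights, tau=1.0, stop_delta=1e-6)
print(clipped.result.shape)  # torch.Size([16]); stays close to the honest mean
print(clipped.n_clipped)     # number of peers clipped in the last iteration
```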

+ 4 - 1
hivemind/averaging/allreduce.py

@@ -4,6 +4,7 @@ from typing import Any, AsyncIterator, Dict, Optional, Sequence, Tuple, Type
 
 import torch
 
+from hivemind.averaging.accumulators import AccumulatorFactory
 from hivemind.averaging.partition import AllreduceException, TensorPartContainer, TensorPartReducer
 from hivemind.compression import deserialize_torch_tensor, serialize_torch_tensor
 from hivemind.p2p import P2P, P2PContext, PeerID, ServicerBase, StubBase
@@ -58,6 +59,7 @@ class AllReduceRunner(ServicerBase):
         tensors: Sequence[torch.Tensor],
         ordered_peer_ids: Sequence[PeerID],
         peer_fractions: Tuple[float, ...],
+        accumulator_factory: AccumulatorFactory,
         weights: Optional[Sequence[float]] = None,
         modes: Optional[Sequence[AveragingMode]] = None,
         gathered: Optional[Dict[PeerID, Any]] = None,
@@ -97,7 +99,8 @@ class AllReduceRunner(ServicerBase):
         self.tensor_part_reducer = TensorPartReducer(
             tuple(part.shape for part in self.parts_for_local_averaging),
             len(self.sender_peer_ids),
-            self.sender_weights,
+            weights=self.sender_weights,
+            accumulator_factory=accumulator_factory,
         )
 
     def __repr__(self):

+ 3 - 0
hivemind/averaging/averager.py

@@ -15,6 +15,7 @@ from typing import Any, AsyncIterator, Dict, Optional, Sequence, Tuple, Union
 import numpy as np
 import torch
 
+from hivemind.averaging.accumulators import AccumulatorFactory, MeanAccumulator
 from hivemind.averaging.allreduce import AllreduceException, AllReduceRunner, AveragingMode, GroupID
 from hivemind.averaging.group_info import GroupInfo
 from hivemind.averaging.load_balancing import load_balance_peers
@@ -112,6 +113,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
         compression: CompressionBase = NoCompression(),
         state_compression: CompressionBase = NoCompression(),
         tensor_infos: Optional[Sequence[CompressionInfo]] = None,
+        accumulator_factory: AccumulatorFactory = MeanAccumulator,
         bandwidth: Optional[float] = None,
         min_vector_size: int = 0,
         auxiliary: bool = False,
@@ -170,6 +172,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
             compression=compression,
             part_size_bytes=part_size_bytes,
             min_vector_size=min_vector_size,
+            accumulator_factory=accumulator_factory,
         )
         self._averaging_alpha, self._allreduce_timeout = averaging_alpha, allreduce_timeout
         self._running_groups: Dict[GroupID, AllReduceRunner] = {}  # one or more assembled groups that run all-reduce

+ 15 - 9
hivemind/averaging/partition.py

@@ -8,6 +8,7 @@ from typing import AsyncIterable, AsyncIterator, Optional, Sequence, Tuple, Type
 import numpy as np
 import torch
 
+from hivemind.averaging.accumulators import AccumulatorFactory
 from hivemind.compression import CompressionBase, CompressionInfo, NoCompression
 from hivemind.proto import runtime_pb2
 from hivemind.utils.asyncio import amap_in_executor
@@ -171,16 +172,23 @@ class TensorPartReducer:
     :note: even if local peer is not sending data, local parts will be used for shape information
     """
 
-    def __init__(self, part_shapes: Sequence[torch.Size], num_senders: int, weights: Optional[Sequence[float]] = None):
+    def __init__(
+        self,
+        part_shapes: Sequence[torch.Size],
+        num_senders: int,
+        *,
+        weights: Optional[Sequence[float]],
+        accumulator_factory: AccumulatorFactory,
+    ):
         self.part_shapes, self.num_senders, self.num_parts = part_shapes, num_senders, len(part_shapes)
         self.weights = tuple(weights or (1 for _ in range(num_senders)))
         assert len(self.weights) == self.num_senders, "The number of weights is inconsistent with num_senders"
         assert all(isinstance(weight, (int, float)) for weight in self.weights)
         self.current_part_index = -1  # index in local_parts of the part that should be loaded next
         self.current_part_accumulated_from = 0  # number of peers from which the current part was accumulated
-        self.accumulator = None  # this will contain the sum of current tensor part from group peers
-        self.denominator = 0.0  # total weight accumulated from all peers for current part
         self.current_part_future = asyncio.Future()
+        self.accumulator_factory = accumulator_factory
+        self.accumulator = None
         self.finished = asyncio.Event()
         self.reset_accumulators()
 
@@ -194,8 +202,7 @@ class TensorPartReducer:
         self.current_part_index += 1
         self.current_part_accumulated_from = 0
         self.current_part_future = asyncio.Future()
-        self.accumulator = torch.zeros(self.part_shapes[self.current_part_index])
-        self.denominator = 0.0
+        self.accumulator = self.accumulator_factory(self.part_shapes[self.current_part_index], self.num_senders)
 
     async def accumulate_part(self, sender_index: int, part_index: int, tensor_part: torch.Tensor) -> torch.Tensor:
         """Add vector part to accumulator, wait for all other vectors to be added, then return the average part"""
@@ -211,13 +218,12 @@ class TensorPartReducer:
 
         current_part_future = self.current_part_future
 
-        self.accumulator.add_(tensor_part, alpha=self.weights[sender_index])
-        self.denominator += self.weights[sender_index]
+        self.accumulator.accumulate_part(tensor_part, self.weights[sender_index])
         self.current_part_accumulated_from += 1
 
         assert self.current_part_accumulated_from <= self.num_senders
         if self.current_part_accumulated_from == self.num_senders:
-            current_part_future.set_result(self.accumulator.div_(self.denominator))
+            current_part_future.set_result(self.accumulator.reduce())
             self.reset_accumulators()
         return await current_part_future
 
@@ -225,7 +231,7 @@ class TensorPartReducer:
         if not self.finished.is_set():
             if hasattr(self, "current_part_future"):
                 self.current_part_future.cancel()
-                del self.accumulator
+                self.accumulator = None
             self.finished.set()
 
     def __del__(self):
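
After this change, the reducer relies only on the two-method accumulator contract: feed each sender's part with its weight, then call `reduce()` once every sender has contributed. A quick self-contained sketch with the default `MeanAccumulator` (values illustrative):

```python
import torch

from hivemind.averaging.accumulators import MeanAccumulator

acc = MeanAccumulator(torch.Size([4]), 2)  # the factory is called as (part_shape, n_peers)
acc.accumulate_part(torch.ones(4), weight=1.0)
acc.accumulate_part(torch.zeros(4), weight=3.0)
print(acc.reduce())  # tensor([0.2500, 0.2500, 0.2500, 0.2500]), i.e. the weighted mean
```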

+ 3 - 1
tests/test_allreduce.py

@@ -7,6 +7,7 @@ import pytest
 import torch
 
 from hivemind import Quantile8BitQuantization, aenumerate
+from hivemind.averaging.accumulators import MeanAccumulator
 from hivemind.averaging.allreduce import AllReduceRunner, AveragingMode
 from hivemind.averaging.partition import TensorPartContainer, TensorPartReducer
 from hivemind.compression import deserialize_torch_tensor
@@ -119,7 +120,7 @@ async def test_partitioning_asynchronous():
 @pytest.mark.asyncio
 async def test_reducer(num_senders: int, num_parts: int, synchronize_prob: float):
     tensor_part_shapes = [torch.Size([i]) for i in range(num_parts)]
-    reducer = TensorPartReducer(tensor_part_shapes, num_senders)
+    reducer = TensorPartReducer(tensor_part_shapes, num_senders, weights=None, accumulator_factory=MeanAccumulator)
 
     local_tensors_by_sender = [[torch.randn(i) for i in range(num_parts)] for j in range(num_senders)]
 
@@ -196,6 +197,7 @@ async def test_allreduce_protocol(peer_modes, averaging_weights, peer_fractions,
             tensors=[x.clone() for x in tensors_by_peer[p2p.peer_id]],
             ordered_peer_ids=peers,
             peer_fractions=peer_fractions,
+            accumulator_factory=MeanAccumulator,
             modes=peer_modes,
             weights=averaging_weights,
             part_size_bytes=part_size_bytes,