
Add gradient buffers to CollaborativeOptimizer (#220)

- Add an option to accumulate gradients locally (default = True; it can be disabled to save some memory, which is not critical in tutorials); see the usage sketch below
- Fix a bug in CollaborativeOptimizer that caused it to implicitly scale all updates by self.local_steps_accumulated
- Fix some typos in the description
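For orientation, a minimal construction sketch using the new options; the model, DHT setup, prefix, and batch-size values below are illustrative placeholders, not taken from this commit:

import torch
import hivemind
from hivemind.client.optim.collaborative import CollaborativeOptimizer

model = torch.nn.Linear(512, 10)              # placeholder model
dht = hivemind.DHT(start=True)                # placeholder single-node DHT

opt = CollaborativeOptimizer(
    opt=torch.optim.Adam(model.parameters(), lr=1e-3),
    dht=dht,
    prefix="demo_run",                        # illustrative experiment prefix
    target_batch_size=4096,                   # samples all peers must accumulate per global step
    batch_size_per_step=32,                   # samples this peer processes per local .step call
    reuse_grad_buffers=False,                 # default added in this commit: keep separate accumulators
    accumulate_grads_on=torch.device("cpu"),  # optional: keep the accumulators in host RAM
    target_group_size=16,                     # illustrative value, forwarded to the underlying averager
    verbose=True,
)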
justheuristic 4 years ago
parent
commit
2359906253
1 changed file with 67 additions and 11 deletions

hivemind/client/optim/collaborative.py  +67 -11

@@ -2,7 +2,7 @@ from __future__ import annotations
 import warnings
 from dataclasses import dataclass
 from threading import Thread, Lock, Event
-from typing import Optional, Type
+from typing import Optional, Iterator
 import logging
 
 import torch
@@ -11,7 +11,7 @@ import numpy as np
 from hivemind.dht import DHT
 from hivemind.client.optim.base import DecentralizedOptimizerBase
 from hivemind.client.averaging.training import TrainingAverager
-from hivemind.utils import get_logger, get_dht_time, run_in_background, ValueWithExpiration
+from hivemind.utils import get_logger, get_dht_time, ValueWithExpiration
 from hivemind.client.optim.performance_ema import PerformanceEMA
 
 logger = get_logger(__name__)
@@ -47,7 +47,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
 
     :note: This optimizer behaves unlike regular pytorch optimizers in two ways:
 
-    - calling .step will periodially zero-out gradients w.r.t. model parameters after each step
+    - calling .step will periodically zero-out gradients w.r.t. model parameters after each step
     - it may take multiple .step calls without updating model parameters, waiting for peers to accumulate enough samples
 
     :param opt: a standard pytorch optimizer, preferably a large-batch one such as LAMB, LARS, etc.
@@ -55,7 +55,6 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
     :param prefix: a common prefix for all metadata stored by CollaborativeOptimizer in the DHT
     :param target_batch_size: perform optimizer step after all peers collectively accumulate this many samples
     :param batch_size_per_step: before each call to .step, user should accumulate gradients over this many samples
-    :param target_group_size: maximum group size for DecentralizedAverager's all-reduce
     :param min_refresh_period: wait for at least this many seconds before fetching new collaboration state
     :param max_refresh_period: wait for at most this many seconds before fetching new collaboration state
     :param default_refresh_period: if no peers are detected, attempt to fetch collaboration state this often (seconds)
@@ -69,6 +68,12 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
     :param metadata_expiration: peer's metadata (e.g. samples processed) is stored onto DHT for this many seconds
     :param averaging_timeout: if an averaging step hangs for this long, it will be cancelled.
     :param scheduler: if specified, use this scheduler to update optimizer learning rate
+    :param reuse_grad_buffers: if True, use model's .grad buffers for gradient accumulation.
+      This is more memory-efficient, but it requires that the user does *NOT* call model/opt zero_grad at all
+    :param accumulate_grads_on: if specified, accumulate gradients on this device. By default, this will use the same
+      device as the model parameters. One can specify a different device (e.g. 'cpu' vs 'cuda') to save device memory
+      at the cost of extra time per step. If reuse_grad_buffers is True, this parameter has no effect.
+    :param kwargs: additional parameters forwarded to DecentralizedAverager
     :note: if you are using CollaborativeOptimizer with a lr_scheduler, it is recommended to pass this scheduler
       explicitly into this class. Otherwise, scheduler may not be synchronized between peers.
     """
@@ -78,14 +83,17 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
                  min_refresh_period: float = 0.5, max_refresh_period: float = 30, default_refresh_period: float = 3,
                  expected_drift_peers: float = 3, expected_drift_rate: float = 0.2, performance_ema_alpha: float = 0.1,
                  metadata_expiration: float = 30.0, averaging_timeout: Optional[float] = None, verbose: bool = False,
-                 **kwargs):
+                 reuse_grad_buffers: bool = False, accumulate_grads_on: Optional[torch.device] = None, **kwargs):
         super().__init__(opt, dht)
+        if reuse_grad_buffers and accumulate_grads_on is not None:
+            logger.warning("Setting 'accumulate_grads_on' has no effect if reuse_grad_buffers=True")
         self.prefix, self.scheduler = prefix, scheduler
         self.target_batch_size, self.batch_size_per_step = target_batch_size, batch_size_per_step
         self.min_refresh_period, self.max_refresh_period, self.default_refresh_period =\
             min_refresh_period, max_refresh_period, default_refresh_period
         self.expected_drift_peers, self.expected_drift_rate = expected_drift_peers, expected_drift_rate
         self.averaging_timeout, self.metadata_expiration = averaging_timeout, metadata_expiration
+        self._grads, self.reuse_grad_buffers, self.accumulate_grads_on = None, reuse_grad_buffers, accumulate_grads_on
         self.status_loglevel = logging.INFO if verbose else logging.DEBUG
         self.averager = self._make_averager(**kwargs)
 
@@ -134,9 +142,12 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
         :param batch_size: optional override for batch_size_per_step from init
         :note: this .step is different from normal pytorch optimizers in several key ways. See __init__ for details.
         """
-        if batch_size is not None and self.batch_size_per_step is None:
-            raise ValueError("Please either set batch_size_per_step parameter at init or provide batch_size in .step")
-        batch_size = self.batch_size_per_step if batch_size is None else batch_size
+        if self.batch_size_per_step is None:
+            if batch_size is None:
+                raise ValueError("Please either set batch_size_per_step parameter at init or when calling .step")
+            logger.log(self.status_loglevel, f"Setting default batch_size_per_step to {batch_size}")
+            self.batch_size_per_step = batch_size
+        batch_size = batch_size if batch_size is not None else self.batch_size_per_step
 
         if not self.is_synchronized:
             self.load_state_from_peers()
@@ -146,6 +157,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
             logger.warning(f"Training step took {get_dht_time() - self.last_step_time}, "
                            f"but metadata expired in {self.metadata_expiration} s.")
 
+        self.accumulate_grads_(batch_size)
         with self.lock_local_progress:
             self.local_samples_accumulated += batch_size
             self.local_steps_accumulated += 1
@@ -164,6 +176,9 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
             return
 
         with self.performance_ema.pause(), self.lock_collaboration_state:
+            # divide accumulators by local steps to recover the true average grad w.r.t. local_samples_accumulated
+            self.apply_accumulated_grads_(scale_by=1. / self.local_steps_accumulated)
+
             if self.collaboration_state.num_peers > 1:
                 mean_samples_per_worker = self.target_batch_size / self.collaboration_state.num_peers
                 weight = self.local_samples_accumulated / mean_samples_per_worker
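As a toy check of the new scaling above (apply_accumulated_grads_ with scale_by = 1 / local_steps_accumulated): the gradient values below are invented and full batches (alpha = 1) are assumed; only the accumulate-then-divide arithmetic mirrors the code.

import torch

per_step_grads = [torch.tensor(1.0), torch.tensor(2.0), torch.tensor(3.0)]  # three local micro-steps
accumulator = torch.zeros(())
for g in per_step_grads:
    accumulator.add_(g, alpha=1.0)                     # what accumulate_grads_ does on each .step call

local_steps_accumulated = len(per_step_grads)
applied = accumulator / local_steps_accumulated        # apply_accumulated_grads_(scale_by=1/steps)
assert torch.isclose(applied, torch.tensor(2.0))       # mean gradient; the old code applied the raw sum (3x too large)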
@@ -176,6 +191,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
 
             self.opt.step()
             self.opt.zero_grad()
+            self.reset_accumulated_grads_()
             self.local_samples_accumulated = self.local_steps_accumulated = 0
             self.collaboration_state.register_step()
             self.collaboration_state_updated.set()
@@ -184,6 +200,46 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
             logger.log(self.status_loglevel, f"Optimizer step: done!")
             return output
 
+    def _grad_buffers(self) -> Iterator[torch.Tensor]:
+        """ pytorch-internal gradient buffers """
+        for param_group in self.opt.param_groups:
+            for param in param_group['params']:
+                if param.grad is None:
+                    yield torch.zeros_like(param)
+                else:
+                    yield param.grad
+
+    @torch.no_grad()
+    def accumulated_grads(self) -> Iterator[torch.Tensor]:
+        """ local gradient accumulators """
+        if self.reuse_grad_buffers:
+            yield from self._grad_buffers()
+            return  # gradients are accumulated directly in .grad; no separate accumulators exist
+        if self._grads is None:  # lazily allocate accumulators, optionally on accumulate_grads_on
+            self._grads = [torch.zeros_like(grad, device=self.accumulate_grads_on) for grad in self._grad_buffers()]
+        yield from self._grads
+
+    @torch.no_grad()
+    def accumulate_grads_(self, batch_size: int):
+        """ add current gradients to grad accumulators (if any) """
+        if self.reuse_grad_buffers:
+            return  # user is responsible for accumulating gradients in .grad buffers
+        alpha = float(batch_size) / self.batch_size_per_step
+        for grad_buf, grad_acc in zip(self._grad_buffers(), self.accumulated_grads()):
+            grad_acc.add_(grad_buf.to(grad_acc.device), alpha=alpha)
+
+    @torch.no_grad()
+    def apply_accumulated_grads_(self, scale_by: Optional[float] = None):
+        for grad_buf, grad_acc in zip(self._grad_buffers(), self.accumulated_grads()):
+            grad_buf[...] = grad_acc.to(grad_buf.device)
+            if scale_by is not None:
+                grad_buf.mul_(scale_by)
+
+    @torch.no_grad()
+    def reset_accumulated_grads_(self):
+        for grad_acc in self.accumulated_grads():  # covers both .grad buffers and separate accumulators
+            grad_acc.zero_()
+
     def report_training_progress(self):
         """ Periodically publish metadata and the current number of samples accumulated towards the next step """
         while self.is_alive():
@@ -235,17 +291,17 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
             if not is_client:
                 global_optimizer_step = max(global_optimizer_step, opt_step)
 
-        total_samples_accumulated = estimated_curent_samples = total_samples_per_second = 0
+        total_samples_accumulated = estimated_current_samples = total_samples_per_second = 0
 
         for opt_step, samples_accumulated, samples_per_second, timestep, is_client in valid_peer_states:
             total_samples_per_second += samples_per_second
             if opt_step == global_optimizer_step:
                 total_samples_accumulated += samples_accumulated
-                estimated_curent_samples += samples_accumulated + max(0, current_time - timestep) * samples_per_second
+                estimated_current_samples += samples_accumulated + max(0, current_time - timestep) * samples_per_second
             # note: we deliberately count only valid peers for samples_accumulated, but all peers for performance;
             # the rationale behind this is that outdated peers will synchronize and begin contributing shortly.
 
-        estimated_samples_remaining = self.target_batch_size - estimated_curent_samples
+        estimated_samples_remaining = self.target_batch_size - estimated_current_samples
         estimated_time_to_next_step = max(0, estimated_samples_remaining) / total_samples_per_second
 
         expected_max_peers = max(num_peers + self.expected_drift_peers, num_peers * (1 + self.expected_drift_rate))
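
A small numeric sketch of the progress estimate in this last hunk; the peer statistics below are invented and only the arithmetic follows the code:

# one up-to-date peer: 500 samples reported 2 seconds ago, currently doing 100 samples/second
samples_accumulated, samples_per_second, seconds_since_report = 500, 100.0, 2.0
target_batch_size = 4096  # illustrative value

estimated_current_samples = samples_accumulated + max(0, seconds_since_report) * samples_per_second  # 700.0
estimated_samples_remaining = target_batch_size - estimated_current_samples                          # 3396.0
estimated_time_to_next_step = max(0, estimated_samples_remaining) / samples_per_second               # 33.96 s
print(estimated_time_to_next_step)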