justheuristic 3 years ago
parent commit eb7e3f5d30
1 changed file with 15 additions and 12 deletions
      hivemind/optim/experimental/optimizer.py


@@ -53,25 +53,27 @@ class Optimizer(torch.optim.Optimizer):
       number of local forward-backward cycles). This is because any device can join midway through training, when
       other peers have already made some progress and changed their learning rate accordingly.
 
-    TODO yozh, the doc below still needs update
-
-    :param opt: a standard pytorch optimizer, preferably a large-batch one such as LAMB, LARS, etc.
-    :param dht: a running hivemind.DHT daemon connected to other peers
-    :param prefix: a common prefix for all metadata stored by CollaborativeOptimizer in the DHT
+    :param dht: a running hivemind.DHT instance connected to other peers
+    :param prefix: a unique name of this experiment, used as a common prefix for all DHT keys
     :param target_batch_size: perform optimizer step after all peers collectively accumulate this many samples
     :param batch_size_per_step: before each call to .step, user should accumulate gradients over this many samples
-    :param bandwidth: peer's network bandwidth for the purpose of load balancing (recommended: internet speed in mbps)
+    :param optimizer: a standard pytorch optimizer, preferably a large-batch one such as LAMB, LARS, etc.
+    :param param_groups: optional, a list/tuple of parameters or structured param groups for the optimizer
+    :param scheduler: if specified, use this scheduler to update optimizer learning rate
+    :note: If you are using hivemind.Optimizer with an lr scheduler, it is recommended to pass this scheduler
+      explicitly into this class. Otherwise, the scheduler may not be synchronized between peers.
+
+    :param matchmaking_time: when looking for a group, wait for peers to join for up to this many seconds
     :param averaging_timeout: if an averaging step hangs for this long, it will be cancelled.
     :param load_state_timeout: wait for at most this many seconds before giving up on load_state_from_peers
-    :param scheduler: if specified, use this scheduler to update optimizer learning rate
-    :param epoch_tolerance: a peer can temporarily be delayed by this many steps without being deemed out of sync
     :param reuse_grad_buffers: if True, use model's .grad buffers for gradient accumulation.
       This is more memory efficient, but it requires that the user does *NOT* call model/opt zero_grad at all
+    :param epoch_tolerance: a peer can temporarily be delayed by this many steps without being deemed out of sync
+    :param delay_optimizer_step: if True, run the optimizer step in the background and apply the results during a future step
     :param client_mode: if True, runs training without incoming connections, in a firewall-compatible mode
-    :param kwargs: additional parameters forwarded to DecentralizedAverager
-    :note: If you are using ColloptaborativeOptimizer with lr_scheduler, it is recommended to pass this scheduler
-      explicitly into this class. Otherwise, scheduler may not be synchronized between peers.
-
+    :param averager_opts: additional keyword arguments forwarded to both GradientAverager and TrainingStateAverager
+    :param tracker_opts: additional keyword arguments forwarded to ProgressTracker
+    :param verbose: if True, report internal events such as accumulating gradients and running background tasks
 
     Internally, hivemind.Optimizer consists of 4 components:
     - DHT, a decentralized key-value storage used for coordination across the swarm
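
For reference, a minimal usage sketch of the interface documented in the hunk above. This is an illustrative example, not code from this commit: the constructor keywords follow the docstring, and hivemind.Optimizer being importable from the package root is an assumption.

import torch
import hivemind

# assumption: this peer bootstraps the swarm; other peers would pass initial_peers=[...]
dht = hivemind.DHT(start=True)

model = torch.nn.Linear(64, 10)
opt = hivemind.Optimizer(
    dht=dht,
    prefix="my_experiment",          # unique experiment name, common prefix for DHT keys
    target_batch_size=4096,          # samples the swarm accumulates per optimizer step
    batch_size_per_step=32,          # samples this peer accumulates per .step call
    optimizer=torch.optim.Adam(model.parameters(), lr=1e-3),
    matchmaking_time=5.0,            # wait up to 5 seconds for peers to form a group
    averaging_timeout=60.0,          # cancel averaging rounds that hang for longer
    verbose=True,
)

# standard training loop: a real optimizer step only happens once the swarm
# collectively reaches target_batch_size accumulated samples
for _ in range(100):
    inputs = torch.randn(32, 64)
    loss = model(inputs).square().mean()
    loss.backward()
    opt.step()
    opt.zero_grad()  # allowed here because reuse_grad_buffers was not enabled
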
@@ -241,6 +243,7 @@ class Optimizer(torch.optim.Optimizer):
                 f"BEFORE: {self.grad_averager.local_samples_accumulated}, {repr([grad.norm() / self.grad_averager.local_times_accumulated for grad in self.grad_averager._grad_accumulators()])}"
             )
 
+
             need_averaging = self.tracker.global_progress.num_peers > 1
             if need_averaging:
                 try: