@@ -63,8 +63,9 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
:note: This optimizer behaves unlike regular pytorch optimizers in two ways:
- - calling .step will periodically zero-out gradients w.r.t. model parameters after each step
- - it may take multiple .step calls without updating model parameters, waiting for peers to accumulate enough samples
+ * calling .step will periodically zero out gradients w.r.t. model parameters after each step
+ * it may take multiple .step calls without updating model parameters, waiting for peers to accumulate enough samples
+
:param opt: a standard pytorch optimizer, preferably a large-batch one such as LAMB, LARS, etc.
:param dht: a running hivemind.DHT daemon connected to other peers
@@ -76,7 +77,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
:param default_refresh_period: if no peers are detected, attempt to fetch collaboration state this often (seconds)
:param expected_drift_peers: assume that this many new peers can join between steps
:param expected_drift_rate: assume that this fraction of current collaboration can join/leave between steps
- :note: the expected collaboration drift parameters are used to adjust the frequency with which this optimizer will
+ :note: The expected collaboration drift parameters are used to adjust the frequency with which this optimizer will
refresh the collaboration-wide statistics (to avoid missing the moment when to run the next step)
:param bandwidth: peer's network bandwidth for the purpose of load balancing (recommended: internet speed in Mbps)
:param step_tolerance: a peer can temporarily be delayed by this many steps without being deemed out of sync
@@ -92,7 +93,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
the cost of extra time per step. If reuse_gradient_accumulators is True, this parameter has no effect.
:param client_mode: if True, runs training without incoming connections, in a firewall-compatible mode
:param kwargs: additional parameters forwarded to DecentralizedAverager
- :note: if you are using CollaborativeOptimizer with a lr_scheduler, it is recommended to pass this scheduler
+ :note: If you are using CollaborativeOptimizer with a lr_scheduler, it is recommended to pass this scheduler
explicitly into this class. Otherwise, the scheduler may not be synchronized between peers.
"""