
Update readthedocs with hivemind.optim (#288)

Co-authored-by: justheuristic <justheuristic@gmail.com>
Michael Diskin 4 years ago
parent
commit
cc8d39c2ea

+ 1 - 1
docs/modules/averaging.rst

@@ -4,10 +4,10 @@
 .. automodule:: hivemind.averaging
 
 .. currentmodule:: hivemind.averaging
-
 .. raw:: html
 
   This module lets you average tensors in a decentralized manner.
+  <br><br>
 
 .. autoclass:: DecentralizedAverager
    :members:
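
For context, a minimal sketch of how the class documented above is typically used (assuming hivemind's DecentralizedAverager constructor with prefix/target_group_size arguments as of this release; verify the names against the installed version):

import torch
import hivemind

# start a DHT node; real deployments pass initial_peers=[...] to join an existing swarm
dht = hivemind.DHT(start=True)

averager = hivemind.averaging.DecentralizedAverager(
    averaged_tensors=[torch.randn(16)],  # local copies of the tensors to be averaged
    dht=dht,
    prefix="demo_averaging",  # peers sharing this prefix form one averaging group (hypothetical name)
    target_group_size=4,      # hypothetical group size; several peers must run this concurrently
    start=True,
)

averager.step(timeout=60)     # one round of decentralized all-reduce with whichever peers are available

with averager.get_tensors() as tensors:  # read back the averaged result
    print(tensors[0])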

+ 2 - 2
docs/modules/client.rst

@@ -1,5 +1,5 @@
-**hivemind.client**
-====================
+**hivemind.moe.client**
+=======================
 
 .. automodule:: hivemind.moe.client
 

+ 3 - 2
docs/modules/index.rst

@@ -5,7 +5,8 @@
 .. toctree::
    :maxdepth: 2
 
+   optim
    averaging
+   dht
    client
-   server
-   dht
+   server

+ 18 - 0
docs/modules/optim.rst

@@ -0,0 +1,18 @@
+**hivemind.optim**
+==================
+
+.. automodule:: hivemind.optim
+.. currentmodule:: hivemind.optim
+
+.. raw:: html
+
+  This module contains decentralized optimizers that wrap regular pytorch optimizers to collaboratively train a shared model. Depending on the exact type, optimizer may average model parameters with peers, exchange gradients, or follow a more complicated distributed training strategy.
+  <br><br>
+
+.. autoclass:: CollaborativeOptimizer
+   :members: step
+   :member-order: bysource
+
+.. autoclass:: CollaborativeAdaptiveOptimizer
+   :members:
+   :member-order: bysource
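
To illustrate the wrapping described in the new page, a hedged sketch of constructing CollaborativeOptimizer around a regular pytorch optimizer; prefix, target_batch_size and batch_size_per_step do not appear in this diff and are assumptions to check against the library's actual signature:

import torch
import hivemind

model = torch.nn.Linear(512, 10)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)  # any regular pytorch optimizer

dht = hivemind.DHT(start=True)  # pass initial_peers=[...] to join an existing collaboration

collaborative_opt = hivemind.optim.CollaborativeOptimizer(
    opt=opt,
    dht=dht,
    prefix="demo_run",       # assumed: key under which peers publish training progress
    target_batch_size=4096,  # assumed: global batch size that triggers a collaborative update
    batch_size_per_step=32,  # assumed: samples processed locally per .step() call
)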

+ 4 - 2
hivemind/optim/adaptive.py

@@ -9,8 +9,10 @@ from hivemind import TrainingAverager
 class CollaborativeAdaptiveOptimizer(CollaborativeOptimizer):
     """
     Behaves exactly as CollaborativeOptimizer except:
-     - averages adaptive learning rates of an optimizer
-     - doesn't average gradients
+
+    * averages adaptive learning rates of an optimizer
+    * doesn't average gradients
+
     :param average_opt_statistics: average optimizer statistics with corresponding names in statedict
     :param kwargs: options for CollaborativeOptimizer
     """

+ 5 - 4
hivemind/optim/collaborative.py

@@ -63,8 +63,9 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
 
     :note: This optimizer behaves unlike regular pytorch optimizers in two ways:
 
-    - calling .step will periodically zero-out gradients w.r.t. model parameters after each step
-    - it may take multiple .step calls without updating model parameters, waiting for peers to accumulate enough samples
+      * calling .step will periodically zero-out gradients w.r.t. model parameters after each step
+      * it may take multiple .step calls without updating model parameters, waiting for peers to accumulate enough samples
+
 
     :param opt: a standard pytorch optimizer, preferably a large-batch one such as LAMB, LARS, etc.
     :param dht: a running hivemind.DHT daemon connected to other peers
@@ -76,7 +77,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
     :param default_refresh_period: if no peers are detected, attempt to fetch collaboration state this often (seconds)
     :param expected_drift_peers: assume that this many new peers can join between steps
     :param expected_drift_rate: assumes that this fraction of current collaboration can join/leave between steps
-    :note: the expected collaboration drift parameters are used to adjust the frequency with which this optimizer will
+    :note: The expected collaboration drift parameters are used to adjust the frequency with which this optimizer will
       refresh the collaboration-wide statistics (to avoid missing the moment when to run the next step)
     :param bandwidth: peer's network bandwidth for the purpose of load balancing (recommended: internet speed in mbps)
     :param step_tolerance: a peer can temporarily be delayed by this many steps without being deemed out of sync
@@ -92,7 +93,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
      the cost of extra time per step. If reuse_gradient_accumulators is True, this parameter has no effect.
     :param client_mode: if True, runs training without incoming connections, in a firewall-compatible mode
     :param kwargs: additional parameters forwarded to DecentralizedAverager
-    :note: if you are using CollaborativeOptimizer with a lr_scheduler, it is recommended to pass this scheduler
+    :note: If you are using CollaborativeOptimizer with lr_scheduler, it is recommended to pass this scheduler
       explicitly into this class. Otherwise, scheduler may not be synchronized between peers.
     """