
Update readthedocs with hivemind.optim (#288)

Co-authored-by: justheuristic <justheuristic@gmail.com>
Michael Diskin 4 years ago
parent
commit
cc8d39c2ea

+ 1 - 1
docs/modules/averaging.rst

@@ -4,10 +4,10 @@
 .. automodule:: hivemind.averaging

 .. currentmodule:: hivemind.averaging
-
 .. raw:: html

   This module lets you average tensors in a decentralized manner.
+  <br><br>

 .. autoclass:: DecentralizedAverager
    :members:

+ 2 - 2
docs/modules/client.rst

@@ -1,5 +1,5 @@
-**hivemind.client**
-====================
+**hivemind.moe.client**
+=======================

 .. automodule:: hivemind.moe.client


+ 3 - 2
docs/modules/index.rst

@@ -5,7 +5,8 @@
 .. toctree::
    :maxdepth: 2

+   optim
    averaging
+   dht
    client
-   server
-   dht
+   server

+ 18 - 0
docs/modules/optim.rst

@@ -0,0 +1,18 @@
+**hivemind.optim**
+==================
+
+.. automodule:: hivemind.optim
+.. currentmodule:: hivemind.optim
+
+.. raw:: html
+
+  This module contains decentralized optimizers that wrap regular pytorch optimizers to collaboratively train a shared model. Depending on the exact type, optimizer may average model parameters with peers, exchange gradients, or follow a more complicated distributed training strategy.
+  <br><br>
+
+.. autoclass:: CollaborativeOptimizer
+   :members: step
+   :member-order: bysource
+
+.. autoclass:: CollaborativeAdaptiveOptimizer
+   :members:
+   :member-order: bysource
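
The new `docs/modules/optim.rst` page above documents CollaborativeOptimizer and CollaborativeAdaptiveOptimizer as the public entry points of `hivemind.optim`. As a quick orientation for reviewers, here is a minimal, hypothetical construction sketch: only `opt`, `dht`, and `bandwidth` are parameters confirmed by the docstrings in this commit, while `prefix`, `target_batch_size`, and `batch_size_per_step` are illustrative assumptions.

```python
# Hypothetical sketch of wrapping a regular pytorch optimizer with
# hivemind.optim.CollaborativeOptimizer, based on the docstrings in this commit.
# Keyword arguments other than `opt`, `dht`, and `bandwidth` are assumptions.
import torch
import hivemind
from hivemind.optim import CollaborativeOptimizer

model = torch.nn.Linear(512, 10)
dht = hivemind.DHT(start=True)  # a running DHT daemon connected to other peers

opt = CollaborativeOptimizer(
    opt=torch.optim.SGD(model.parameters(), lr=0.1),  # preferably a large-batch optimizer (LAMB, LARS)
    dht=dht,
    prefix="my_experiment",    # assumed: key prefix for collaboration state in the DHT
    target_batch_size=4096,    # assumed: global batch size that triggers a collaborative step
    batch_size_per_step=32,    # assumed: samples processed locally per .step() call
    bandwidth=100.0,           # documented: network bandwidth for load balancing (mbps)
)
```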

+ 4 - 2
hivemind/optim/adaptive.py

@@ -9,8 +9,10 @@ from hivemind import TrainingAverager
 class CollaborativeAdaptiveOptimizer(CollaborativeOptimizer):
     """
     Behaves exactly as CollaborativeOptimizer except:
-     - averages adaptive learning rates of an optimizer
-     - doesn't average gradients
+
+    * averages adaptive learning rates of an optimizer
+    * doesn't average gradients
+
     :param average_opt_statistics: average optimizer statistics with corresponding names in statedict
     :param kwargs: options for CollaborativeOptimizer
     """

+ 5 - 4
hivemind/optim/collaborative.py

@@ -63,8 +63,9 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):

     :note: This optimizer behaves unlike regular pytorch optimizers in two ways:

-    - calling .step will periodically zero-out gradients w.r.t. model parameters after each step
-    - it may take multiple .step calls without updating model parameters, waiting for peers to accumulate enough samples
+      * calling .step will periodically zero-out gradients w.r.t. model parameters after each step
+      * it may take multiple .step calls without updating model parameters, waiting for peers to accumulate enough samples
+

     :param opt: a standard pytorch optimizer, preferably a large-batch one such as LAMB, LARS, etc.
     :param dht: a running hivemind.DHT daemon connected to other peers
@@ -76,7 +77,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
     :param default_refresh_period: if no peers are detected, attempt to fetch collaboration state this often (seconds)
     :param expected_drift_peers: assume that this many new peers can join between steps
     :param expected_drift_rate: assumes that this fraction of current collaboration can join/leave between steps
-    :note: the expected collaboration drift parameters are used to adjust the frequency with which this optimizer will
+    :note: The expected collaboration drift parameters are used to adjust the frequency with which this optimizer will
      refresh the collaboration-wide statistics (to avoid missing the moment when to run the next step)
     :param bandwidth: peer's network bandwidth for the purpose of load balancing (recommended: internet speed in mbps)
     :param step_tolerance: a peer can temporarily be delayed by this many steps without being deemed out of sync
@@ -92,7 +93,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
      the cost of extra time per step. If reuse_gradient_accumulators is True, this parameter has no effect.
     :param client_mode: if True, runs training without incoming connections, in a firewall-compatible mode
     :param kwargs: additional parameters forwarded to DecentralizedAverager
-    :note: if you are using CollaborativeOptimizer with a lr_scheduler, it is recommended to pass this scheduler
+    :note: If you are using CollaborativeOptimizer with lr_scheduler, it is recommended to pass this scheduler
      explicitly into this class. Otherwise, scheduler may not be synchronized between peers.
     """