
Support auxiliary peers in CollaborativeOptimizer (#279)

* Support auxiliary peers for our ALBERT training

* Update hivemind/optim/collaborative.py

Co-authored-by: justheuristic <justheuristic@gmail.com>

* Remove unnecessary

* Update hivemind/optim/collaborative.py

Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>

* Fixes for review

* range(len()) -> enumerate

* Update config.yml

Co-authored-by: justheuristic <justheuristic@gmail.com>
Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>
Michael Diskin, 4 years ago
parent
commit 86f3c0dd0d

+ 1 - 0
.circleci/config.yml

@@ -68,6 +68,7 @@ jobs:
       - image: circleci/python:3.9.1
     steps:
       - checkout
+      - run: ulimit -n 4096 # temporary workaround for py39
       - restore_cache:
           keys:
             - py39-v1-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }}
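For reference, the same file-descriptor limit can also be raised from inside a Python process rather than via the shell; a minimal sketch using only the standard-library resource module (the value 4096 mirrors the CI workaround above and is not a hivemind requirement):

```python
# Sketch: raise the open-file-descriptor soft limit from within Python,
# mirroring the `ulimit -n 4096` CI workaround above. Standard library only (Unix).
import resource

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
target = 4096
if hard != resource.RLIM_INFINITY:
    # The soft limit may never exceed the hard limit.
    target = min(target, hard)
resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))
```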

+ 4 - 1
examples/albert/run_first_peer.py

@@ -17,7 +17,6 @@ import hivemind
 from hivemind.utils.logging import get_logger
 import metrics_utils
 
-
 logger = get_logger(__name__)
 
 
@@ -163,6 +162,10 @@ if __name__ == '__main__':
                        for peer in metrics_dict]
             latest_step = max(item.step for item in metrics)
             if latest_step != current_step:
+                logger.debug(f"Got metrics from {len(metrics)} peers")
+
+                for i, metrics_for_peer in enumerate(metrics):
+                    logger.debug(f"{i} peer {metrics_for_peer}")
                 current_step = latest_step
                 alive_peers = 0
                 num_batches = 0

+ 36 - 2
hivemind/optim/collaborative.py

@@ -232,9 +232,43 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
             self.collaboration_state_updated.set()
             self.update_scheduler()
 
-            logger.log(self.status_loglevel, f"Optimizer step: done!")
+        logger.log(self.status_loglevel, f"Optimizer step: done!")
 
-            return group_info
+        return group_info
+
+    def step_aux(self, **kwargs):
+        """
+        Find and assist other peers in averaging without sending local gradients.
+
+        :note: this .step is different from normal pytorch optimizers in several key ways. See __init__ for details.
+        """
+
+        if not self.collaboration_state.ready_for_step:
+            return
+
+        logger.log(self.status_loglevel,
+                   f"Beginning global optimizer step {self.collaboration_state.optimizer_step}")
+        self.collaboration_state = self.fetch_collaboration_state()
+        self.collaboration_state_updated.set()
+
+        with self.lock_collaboration_state:
+            # note: unlike step(), an auxiliary peer has no local gradient accumulators; it only joins the averaging round
+            current_step, group_info = self.averager.local_step, None
+            try:
+                group_info = self.averager.step(timeout=self.averaging_timeout, **kwargs)
+                if group_info:
+                    logger.log(self.status_loglevel,
+                               f"Averaged tensors successfully with {len(group_info)} peers")
+            except BaseException as e:
+                logger.log(self.status_loglevel, f"Skipped averaging: averaging round failed with {repr(e)}.")
+
+            self.collaboration_state.register_step(current_step + 1)
+            self.averager.local_step = current_step + 1
+            self.collaboration_state_updated.set()
+
+        logger.log(self.status_loglevel, f"Optimizer step: done!")
+
+        return group_info
 
     def _grad_buffers(self) -> Iterator[torch.Tensor]:
         """ pytorch-internal gradient buffers """