|
@@ -16,6 +16,7 @@ import numpy as np
|
|
|
import torch
|
|
|
|
|
|
from hivemind.averaging.allreduce import AllreduceException, AllReduceRunner, AveragingMode, GroupID
|
|
|
+from hivemind.averaging.control import StepControl, AveragingStage
|
|
|
from hivemind.averaging.group_info import GroupInfo
|
|
|
from hivemind.averaging.load_balancing import load_balance_peers
|
|
|
from hivemind.averaging.matchmaking import Matchmaking, MatchmakingException
|
|
@@ -303,7 +304,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
|
|
|
else:
|
|
|
logger.exception("Averager shutdown has no effect: the process is already not alive")
|
|
|
|
|
|
- async def _shutdown(self, timeout: Optional[float] = None) -> None:
|
|
|
+ async def _shutdown(self) -> None:
|
|
|
remaining_tasks = set()
|
|
|
for group in self._running_groups.values():
|
|
|
remaining_tasks.update(group.finalize(cancel=True))
|
|
@@ -316,68 +317,68 @@ class DecentralizedAverager(mp.Process, ServicerBase):
|
|
|
def step(
    self,
    gather: Optional[GatheredData] = None,
    scheduled_time: Optional[DHTExpiration] = None,
    weight: Optional[float] = None,
    timeout: Optional[float] = None,
    allow_retries: bool = True,
    wait_for_trigger: bool = False,
    wait: bool = True,
) -> Union[Optional[Dict[PeerID, GatheredData]], StepControl]:
    """
    Set up the averager to look for a group and run one round of averaging.

    :param gather: optionally send this information to all peers in the next group and gather it from every groupmate
      (this operation is known as all-gather). The gathered data will be available as the output of this function.
    :param scheduled_time: when matchmaking, assume that all-reduce will begin at this moment;
      by default, begin at the end of the current matchmaking window (now + averaging_expiration)
    :param weight: averaging weight for this peer, int or float, must be non-negative
    :param timeout: if averager was unable to *find* a group in this many seconds, consider allreduce failed
    :param allow_retries: if averager fails to run one round of allreduce, this option will allow it to try again
      within the specified timeout
    :param wait_for_trigger: if True, await for user to call .allow_allreduce() before running all-reduce
    :param wait: if True (default), return when finished. Otherwise return StepControl and run in background.
    :returns: on success, update averaged_tensors and return group info; on failure, return None
    """
    if self.mode == AveragingMode.AUX and weight is not None:
        logger.warning("Averager is running in auxiliary mode, weight is unused.")
    if scheduled_time is None:
        scheduled_time = get_dht_time() + self.matchmaking_kwargs["averaging_expiration"]
    if weight is None:
        # auxiliary peers contribute no tensors of their own, hence zero averaging weight
        weight = float(self.mode != AveragingMode.AUX)
    assert isinstance(weight, (int, float)) and weight >= 0, f"Expected a non-negative int/float, got {type(weight)}"
    assert not wait_for_trigger or wait, "Non-asynchronous step cannot wait for trigger (use wait=False)"
    gather_binary = self.serializer.dumps(gather)  # serialize here to avoid imports in the averager process

    step = StepControl(
        scheduled_time,
        weight,
        wait_for_trigger=wait_for_trigger,
        gather_binary=gather_binary,
        timeout=timeout,
        allow_retries=allow_retries,
    )
    # NOTE: _step declares gather_binary/allow_retries/timeout as required keyword-only parameters,
    # so they must be forwarded explicitly; sending only `step` would raise TypeError in the averager process.
    self._outer_pipe.send(
        ("_step", [], dict(step=step, gather_binary=gather_binary, allow_retries=allow_retries, timeout=timeout))
    )
    return step.result() if wait else step
|
|
|
|
|
|
async def _step(
|
|
|
- self, *, future: MPFuture, gather_binary: bytes, weight: float, allow_retries: bool, timeout: Optional[float]
|
|
|
+ self, *, step: StepControl, gather_binary: bytes, allow_retries: bool, timeout: Optional[float]
|
|
|
):
|
|
|
start_time = get_dht_time()
|
|
|
|
|
|
try:
|
|
|
- while not future.done():
|
|
|
+ while not step.done():
|
|
|
try:
|
|
|
self._pending_group_assembled.clear()
|
|
|
data_for_gather = self.serializer.dumps([self.bandwidth, self.mode.value, gather_binary])
|
|
|
- group_info = await self._matchmaking.look_for_group(
|
|
|
- timeout=timeout, data_for_gather=data_for_gather
|
|
|
- )
|
|
|
+
|
|
|
+ step.stage = AveragingStage.LOOKING_FOR_GROUP
|
|
|
+ group_info = await self._matchmaking.look_for_group(step)
|
|
|
if group_info is None:
|
|
|
raise AllreduceException("Averaging step failed: could not find a group.")
|
|
|
|
|
|
- future.set_result(
|
|
|
+ if not step.done():
|
|
|
+ step.stage = AveragingStage.AWAITING_TRIGGER
|
|
|
+
|
|
|
+ await step.wait_for_trigger()
|
|
|
+ step.stage = AveragingStage.RUNNING_ALLREDUCE
|
|
|
+
|
|
|
+ step.set_result(
|
|
|
await asyncio.wait_for(
|
|
|
self._run_allreduce(
|
|
|
- group_info, tensor_infos=self.tensor_infos, weight=weight, **self.allreduce_kwargs
|
|
|
+ group_info, tensor_infos=self.tensor_infos, weight=step.weight, **self.allreduce_kwargs
|
|
|
),
|
|
|
timeout=self._allreduce_timeout,
|
|
|
)
|
|
@@ -396,17 +397,18 @@ class DecentralizedAverager(mp.Process, ServicerBase):
|
|
|
time_elapsed = get_dht_time() - start_time
|
|
|
if not allow_retries or (timeout is not None and timeout < time_elapsed):
|
|
|
logger.exception(f"Averager caught {repr(e)}")
|
|
|
- future.set_exception(e)
|
|
|
+ step.set_exception(e)
|
|
|
else:
|
|
|
logger.warning(f"Averager caught {repr(e)}, retrying")
|
|
|
|
|
|
except BaseException as e:
|
|
|
- if not future.done():
|
|
|
- future.set_exception(e)
|
|
|
+ if not step.done():
|
|
|
+ step.set_exception(e)
|
|
|
raise
|
|
|
finally:
|
|
|
- if not future.done():
|
|
|
- future.set_exception(
|
|
|
+ step.stage = AveragingStage.FINISHED
|
|
|
+ if not step.done():
|
|
|
+ step.set_exception(
|
|
|
RuntimeError(
|
|
|
"Internal sanity check failed: averager.step left future pending."
|
|
|
" Please report this to hivemind issues."
|