
Process-wide channel cache for gRPC+aio (#120)

Changelist:
* Added a process-wide ChannelCache for gRPC channels and stubs
* Switched dht/protocol.py, client/allreduce.py, and client/expert.py to the new cache
* Moved TimedStorage to utils
* Fixed an edge case where a TimedStorage with a fixed maxsize sometimes evicted too many entries (size was measured as len(self.expiration_heap) when it should be len(self.data)); see the sketch below
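
A minimal repro of that edge case (key names and expiration offsets are illustrative; assumes TimedStorage as moved to hivemind/utils/timed_storage.py in this PR):

from hivemind.utils.timed_storage import TimedStorage, get_dht_time

storage: TimedStorage[str, bytes] = TimedStorage(maxsize=2)
now = get_dht_time()
storage.store('key_a', b'a', now + 5)
storage.store('key_b', b'b', now + 10)
storage.store('key_b', b'b2', now + 20)  # re-storing leaves a stale "ghost" heap entry for key_b

# the heap now holds 3 entries (one of them a ghost) for only 2 stored keys;
# with size measured as len(self.expiration_heap), the live key_a got evicted here,
# whereas len(self.data) == 2 == maxsize means nothing should be removed
assert len(storage) == 2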

Before this commit, each time we queried a remote DHT peer over gRPC, we created a new channel. In contrast, gRPC best practices recommend reusing channels across multiple RPC calls.

hivemind/client/expert.py already introduced channel caching (via functools.lru_cache), but it had the side effect of keeping channels open forever.
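
For context, the removed pattern looked roughly like this (simplified from the expert.py diff below):

from functools import lru_cache

import grpc
from hivemind.proto import runtime_pb2_grpc as runtime_grpc

@lru_cache(maxsize=None)  # cache entries are never evicted, so...
def _get_expert_stub(endpoint: str):
    # ...every endpoint ever contacted keeps its channel open for the process lifetime
    return runtime_grpc.ConnectionHandlerStub(grpc.insecure_channel(endpoint))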

In this PR we implement a process-wide ChannelCache object that keeps track of open channels. The code is largely inspired by https://github.com/grpc/grpc/blob/master/src/python/grpcio/grpc/_simple_stubs.py, but with added support for grpc.aio channels.
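
As a minimal usage sketch (the endpoint string is illustrative; pass aio=True from asyncio call sites to get a grpc.aio stub instead):

from hivemind.proto import dht_pb2_grpc
from hivemind.utils.grpc import ChannelCache

# before: every RPC created its own channel, e.g.
#   stub = dht_pb2_grpc.DHTStub(grpc.insecure_channel('localhost:1337'))

# after: the process-wide cache creates each channel once and reuses it; calls with
# the same (target, aio, options, credentials, compression) return the same stub
stub_a = ChannelCache.get_stub('localhost:1337', dht_pb2_grpc.DHTStub, aio=False)
stub_b = ChannelCache.get_stub('localhost:1337', dht_pb2_grpc.DHTStub, aio=False)
assert stub_a is stub_b  # evicted in the background ~EVICTION_PERIOD_SECONDS after last use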


Co-authored-by: Vsevolod-pl <vsevolod-pl@yandex.ru>
Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>
justheuristic, 4 years ago
Parent commit: 1754792aad

+ 1 - 1
hivemind/__init__.py

@@ -3,4 +3,4 @@ from hivemind.dht import *
 from hivemind.server import *
 from hivemind.utils import *
 
-__version__ = '0.8.13'
+__version__ = '0.8.14'

+ 6 - 8
hivemind/client/allreduce.py

@@ -11,7 +11,7 @@ import torch
 
 from hivemind.dht import DHTID, DHTExpiration
 from hivemind.utils import Endpoint, get_logger, MSGPackSerializer
-from hivemind.utils import TensorDescriptor, deserialize_torch_tensor, serialize_torch_tensor
+from hivemind.utils import TensorDescriptor, deserialize_torch_tensor, serialize_torch_tensor, ChannelCache
 from hivemind.proto import averaging_pb2, averaging_pb2_grpc, runtime_pb2
 
 logger = get_logger(__name__)
@@ -149,10 +149,8 @@ class GroupAllReduce:
 
         return await self.averaged_part
 
-    def _get(self, peer: Endpoint) -> averaging_pb2_grpc.DecentralizedAveragingStub:
-        """ TODO this function is deprecated and will be replaced by a shared channel cache """
-        channel = grpc.aio.insecure_channel(peer)
-        return averaging_pb2_grpc.DecentralizedAveragingStub(channel)
+    def _get_peer_stub(self, peer: Endpoint) -> averaging_pb2_grpc.DecentralizedAveragingStub:
+        return ChannelCache.get_stub(peer, averaging_pb2_grpc.DecentralizedAveragingStub, aio=True)
 
     async def handle_join_request(self, request: averaging_pb2.PeerInfo
                                   ) -> AsyncIterator[averaging_pb2.MessageFromLeader]:
@@ -220,7 +218,7 @@ class GroupAllReduce:
         assert self.state == ProtocolState.LOOKING_FOR_GROUP
         try:
             async with self.concurrent_requests_lock:
-                stream = self._get(leader).rpc_group_allreduce(self.info)
+                stream = self._get_peer_stub(leader).rpc_group_allreduce(self.info)
                 message = await stream.read()
                 logger.debug(f"{self} - requested {leader} to be my leader, received "
                              f"{averaging_pb2.MessageCode.Name(message.code)}")
@@ -259,7 +257,7 @@ class GroupAllReduce:
                 self.average_tensor_parts[peer_endpoint] = await self.accumulate(peer_endpoint, local_part)
             else:
                 serialized_tensor_part = serialize_torch_tensor(local_part, self.compression_type, allow_inplace=False)
-                response = await self._get(peer_endpoint).rpc_aggregate_part(averaging_pb2.AveragingData(
+                response = await self._get_peer_stub(peer_endpoint).rpc_aggregate_part(averaging_pb2.AveragingData(
                     group_id=self.group_id, endpoint=self.info.endpoint, tensor_part=serialized_tensor_part))
 
                 if response.code == averaging_pb2.ACCEPTED:
@@ -279,7 +277,7 @@ class GroupAllReduce:
             code = averaging_pb2.CANCELLED if isinstance(e, asyncio.CancelledError) else averaging_pb2.INTERNAL_ERROR
 
             async def send_error_to_peer(peer_endpoint):
-                await self._get(peer_endpoint).rpc_aggregate_part(averaging_pb2.AveragingData(
+                await self._get_peer_stub(peer_endpoint).rpc_aggregate_part(averaging_pb2.AveragingData(
                     group_id=self.group_id, endpoint=self.info.endpoint, code=code))
             for peer_endpoint in ordered_group_endpoints:
                 asyncio.create_task(send_error_to_peer(peer_endpoint))

+ 3 - 8
hivemind/client/expert.py

@@ -1,26 +1,21 @@
 import pickle
-from functools import lru_cache
 from typing import Tuple, Optional, Any, Dict
 
-import grpc
 import torch
 import torch.nn as nn
 from torch.autograd.function import once_differentiable
 
 from hivemind.proto import runtime_pb2, runtime_pb2_grpc as runtime_grpc
 from hivemind.utils import nested_flatten, nested_pack, nested_compare, Endpoint
-from hivemind.utils.grpc import serialize_torch_tensor, deserialize_torch_tensor
+from hivemind.utils.grpc import serialize_torch_tensor, deserialize_torch_tensor, ChannelCache
 
 DUMMY = torch.empty(0, requires_grad=True)  # dummy tensor that triggers autograd in RemoteExpert
 
 
-@lru_cache(maxsize=None)
 def _get_expert_stub(endpoint: Endpoint, *extra_options: Tuple[str, Any]):
     """ Create a gRPC stub to access remote expert or use previously created stub from a process-wide cache """
-    channel_options = [
-        ('grpc.max_send_message_length', -1), ('grpc.max_receive_message_length', -1)
-    ] + list(extra_options)
-    return runtime_grpc.ConnectionHandlerStub(grpc.insecure_channel(endpoint, options=channel_options))
+    channel_options = (('grpc.max_send_message_length', -1), ('grpc.max_receive_message_length', -1)) + extra_options
+    return ChannelCache.get_stub(endpoint, runtime_grpc.ConnectionHandlerStub, aio=False, options=channel_options)
 
 
 class RemoteExpert(nn.Module):

+ 1 - 1
hivemind/dht/__init__.py

@@ -26,7 +26,7 @@ import uvloop
 from hivemind.client import RemoteExpert
 from hivemind.dht.node import DHTNode, DHTID, DHTExpiration
 from hivemind.dht.routing import get_dht_time, DHTValue
-from hivemind.dht.storage import ValueWithExpiration
+from hivemind.utils.timed_storage import ValueWithExpiration
 from hivemind.utils import MPFuture, Endpoint, get_logger
 
 logger = get_logger(__name__)

+ 2 - 2
hivemind/dht/node.py

@@ -11,9 +11,9 @@ from sortedcontainers import SortedList
 
 from hivemind.dht.protocol import DHTProtocol
 from hivemind.dht.routing import DHTID, DHTExpiration, DHTKey, get_dht_time, DHTValue, BinaryDHTValue, Subkey
-from hivemind.dht.storage import CacheRefreshQueue, DictionaryDHTValue, ValueWithExpiration
+from hivemind.dht.storage import CacheRefreshQueue, DictionaryDHTValue
 from hivemind.dht.traverse import traverse_dht
-from hivemind.utils import Endpoint, LOCALHOST, MSGPackSerializer, get_logger, SerializerBase
+from hivemind.utils import Endpoint, LOCALHOST, MSGPackSerializer, get_logger, SerializerBase, ValueWithExpiration
 
 logger = get_logger(__name__)
 

+ 7 - 8
hivemind/dht/protocol.py

@@ -7,9 +7,9 @@ from typing import Optional, List, Tuple, Dict, Any, Sequence, Union, Collection
 import grpc
 
 from hivemind.dht.routing import RoutingTable, DHTID, BinaryDHTValue, DHTExpiration, Subkey
-from hivemind.dht.storage import DHTLocalStorage, DictionaryDHTValue, ValueWithExpiration
+from hivemind.dht.storage import DHTLocalStorage, DictionaryDHTValue
 from hivemind.proto import dht_pb2, dht_pb2_grpc as dht_grpc
-from hivemind.utils import Endpoint, get_logger, replace_port, MSGPackSerializer
+from hivemind.utils import Endpoint, get_logger, replace_port, MSGPackSerializer, ChannelCache, ValueWithExpiration
 
 logger = get_logger(__name__)
 
@@ -78,10 +78,9 @@ class DHTProtocol(dht_grpc.DHTServicer):
         else:
             logger.warning("DHTProtocol has no server (due to listen=False), it doesn't need to be shut down")
 
-    def _get(self, peer: Endpoint) -> dht_grpc.DHTStub:
+    def _get_dht_stub(self, peer: Endpoint) -> dht_grpc.DHTStub:
         """ get a DHTStub that sends requests to a given peer """
-        channel = grpc.aio.insecure_channel(peer, options=self.channel_options)
-        return dht_grpc.DHTStub(channel)
+        return ChannelCache.get_stub(peer, dht_grpc.DHTStub, aio=True, options=self.channel_options)
 
     async def call_ping(self, peer: Endpoint) -> Optional[DHTID]:
         """
@@ -93,7 +92,7 @@ class DHTProtocol(dht_grpc.DHTServicer):
         """
         try:
             async with self.rpc_semaphore:
-                peer_info = await self._get(peer).rpc_ping(self.node_info, timeout=self.wait_timeout)
+                peer_info = await self._get_dht_stub(peer).rpc_ping(self.node_info, timeout=self.wait_timeout)
         except grpc.aio.AioRpcError as error:
             logger.warning(f"DHTProtocol failed to ping {peer}: {error.code()}")
             peer_info = None
@@ -155,7 +154,7 @@ class DHTProtocol(dht_grpc.DHTServicer):
                                              expiration_time=expiration_time, in_cache=in_cache, peer=self.node_info)
         try:
             async with self.rpc_semaphore:
-                response = await self._get(peer).rpc_store(store_request, timeout=self.wait_timeout)
+                response = await self._get_dht_stub(peer).rpc_store(store_request, timeout=self.wait_timeout)
             if response.peer and response.peer.node_id:
                 peer_id = DHTID.from_bytes(response.peer.node_id)
                 asyncio.create_task(self.update_routing_table(peer_id, peer, responded=True))
@@ -203,7 +202,7 @@ class DHTProtocol(dht_grpc.DHTServicer):
         find_request = dht_pb2.FindRequest(keys=list(map(DHTID.to_bytes, keys)), peer=self.node_info)
         try:
             async with self.rpc_semaphore:
-                response = await self._get(peer).rpc_find(find_request, timeout=self.wait_timeout)
+                response = await self._get_dht_stub(peer).rpc_find(find_request, timeout=self.wait_timeout)
             if response.peer and response.peer.node_id:
                 peer_id = DHTID.from_bytes(response.peer.node_id)
                 asyncio.create_task(self.update_routing_table(peer_id, peer, responded=True))

+ 2 - 5
hivemind/dht/routing.py

@@ -5,15 +5,12 @@ import hashlib
 import heapq
 import os
 import random
-import time
 from collections.abc import Iterable
 from itertools import chain
 from typing import Tuple, Optional, List, Dict, Set, Union, Any, Sequence
+from hivemind.utils import Endpoint, PickleSerializer, get_dht_time, DHTExpiration
 
-from hivemind.utils import Endpoint, PickleSerializer
-
-DHTKey, Subkey, DHTValue, DHTExpiration, BinaryDHTID, BinaryDHTValue, = Any, Any, Any, float, bytes, bytes
-get_dht_time = time.time  # time used by all dht functionality. You can replace this with any infrastructure-wide time
+DHTKey, Subkey, DHTValue, BinaryDHTID, BinaryDHTValue, = Any, Any, Any, bytes, bytes
 
 
 class RoutingTable:

+ 3 - 105
hivemind/dht/storage.py

@@ -1,111 +1,9 @@
 from __future__ import annotations
-import heapq
-from contextlib import contextmanager
-from typing import Generic, Optional, Dict, Tuple, List, Iterator, TypeVar, Union, NamedTuple
+from typing import Optional, Union
 
-from hivemind.dht.routing import DHTID, DHTExpiration, get_dht_time, BinaryDHTValue, Subkey
+from hivemind.dht.routing import DHTID, DHTExpiration, BinaryDHTValue, Subkey
 from hivemind.utils.serializer import MSGPackSerializer
-
-KeyType = TypeVar('KeyType')
-ValueType = TypeVar('ValueType')
-ROOT = 0
-
-
-class ValueWithExpiration(NamedTuple, Generic[ValueType]):
-    value: ValueType
-    expiration_time: DHTExpiration
-
-
-class HeapEntry(NamedTuple, Generic[KeyType]):
-    expiration_time: DHTExpiration
-    key: KeyType
-
-
-class TimedStorage(Generic[KeyType, ValueType]):
-    """ A dictionary that maintains up to :maxsize: key-value-expiration tuples until their expiration_time """
-    frozen = False  # can be set to True. If true, do not remove outdated elements
-
-    def __init__(self, maxsize: Optional[int] = None):
-        self.maxsize = maxsize or float("inf")
-        self.data: Dict[KeyType, ValueWithExpiration[ValueType]] = dict()
-        self.expiration_heap: List[HeapEntry[KeyType]] = []
-        self.key_to_heap: Dict[KeyType, HeapEntry[KeyType]] = dict()
-
-    def _remove_outdated(self):
-        while not self.frozen and self.expiration_heap and (self.expiration_heap[ROOT].expiration_time < get_dht_time()
-                                                            or len(self.expiration_heap) > self.maxsize):
-            heap_entry = heapq.heappop(self.expiration_heap)
-            if self.key_to_heap.get(heap_entry.key) == heap_entry:
-                del self.data[heap_entry.key], self.key_to_heap[heap_entry.key]
-
-    def store(self, key: KeyType, value: ValueType, expiration_time: DHTExpiration) -> bool:
-        """
-        Store a (key, value) pair locally at least until expiration_time. See class docstring for details.
-        :returns: True if new value was stored, False if it was rejected (current value is newer)
-        """
-        if expiration_time < get_dht_time() and not self.frozen:
-            return False
-        self.key_to_heap[key] = HeapEntry(expiration_time, key)
-        heapq.heappush(self.expiration_heap, self.key_to_heap[key])
-        if key in self.data:
-            if self.data[key].expiration_time < expiration_time:
-                self.data[key] = ValueWithExpiration(value, expiration_time)
-                return True
-            return False
-        self.data[key] = ValueWithExpiration(value, expiration_time)
-        self._remove_outdated()
-        return True
-
-    def get(self, key: KeyType) -> Optional[ValueWithExpiration[ValueType]]:
-        """ Get a value corresponding to a key if that (key, value) pair was previously stored under this key. """
-        self._remove_outdated()
-        if key in self.data:
-            return self.data[key]
-        return None
-
-    def items(self) -> Iterator[Tuple[KeyType, ValueWithExpiration[ValueType]]]:
-        """ Iterate over (key, value, expiration_time) tuples stored in this storage """
-        self._remove_outdated()
-        return ((key, value_and_expiration) for key, value_and_expiration in self.data.items())
-
-    def top(self) -> Tuple[Optional[KeyType], Optional[ValueWithExpiration[ValueType]]]:
-        """ Return the entry with earliest expiration or None if there isn't any """
-        self._remove_outdated()
-        if self.data:
-            # skip leftover "ghost" entries until first real entry
-            while self.key_to_heap.get(self.expiration_heap[ROOT].key) != self.expiration_heap[ROOT]:
-                heapq.heappop(self.expiration_heap)
-            top_key = self.expiration_heap[ROOT].key
-            return top_key, self.data[top_key]
-        return None, None
-
-    def __contains__(self, key: KeyType):
-        self._remove_outdated()
-        return key in self.data
-
-    def __len__(self):
-        self._remove_outdated()
-        return len(self.data)
-
-    def __delitem__(self, key: KeyType):
-        if key in self.key_to_heap:
-            del self.data[key], self.key_to_heap[key]
-        # note: key may still be in self.expiration_heap, but it will not be used and will eventually be removed by ._remove_outdated()
-
-    def __bool__(self):
-        return bool(self.data)
-
-    def __repr__(self):
-        return f"{self.__class__.__name__}({self.data})"
-
-    @contextmanager
-    def freeze(self):
-        """ Temporarily cease to ._remove_outdated() elements inside this context to ensure consistency """
-        prev_frozen, self.frozen = self.frozen, True
-        try:
-            yield self
-        finally:
-            self.frozen = prev_frozen
+from hivemind.utils.timed_storage import KeyType, ValueType, TimedStorage
 
 
 @MSGPackSerializer.ext_serializable(0x50)

+ 1 - 0
hivemind/utils/__init__.py

@@ -5,4 +5,5 @@ from hivemind.utils.serializer import *
 from hivemind.utils.mpfuture import *
 from hivemind.utils.threading import *
 from hivemind.utils.grpc import *
+from hivemind.utils.timed_storage import *
 from hivemind.utils.logging import get_logger

+ 132 - 1
hivemind/utils/grpc.py

@@ -1,17 +1,148 @@
 """
 Utilities for running GRPC services: compile protobuf, patch legacy versions, etc
 """
+from __future__ import annotations
+import os
+import threading
+from typing import NamedTuple, Sequence, Tuple, Optional, Union, Any, Dict, TypeVar, Type, NoReturn
 
+import grpc
 import numpy as np
 import torch
 
 from hivemind.proto import runtime_pb2
 from hivemind.proto.runtime_pb2 import CompressionType
+from hivemind.utils.timed_storage import TimedStorage, get_dht_time, DHTExpiration, ValueWithExpiration
+from hivemind.utils.networking import Endpoint
+from hivemind.utils.logging import get_logger
+
+logger = get_logger(__file__)
+
+Stub = TypeVar("Stub")
+
+
+class ChannelInfo(NamedTuple):
+    target: Endpoint
+    aio: bool
+    options: Tuple[Tuple[str, str], ...]
+    credentials: Optional[grpc.ChannelCredentials]
+    compression: Optional[grpc.Compression]
+
+
+class ChannelCache(TimedStorage[ChannelInfo, Tuple[Union[grpc.Channel, grpc.aio.Channel], Dict]]):
+    """
+    A process-wide cache of gRPC channels, supports both normal and aio channels, secure/insecure channels, etc
+    Based on grpcio internal channel cache by Richard Belleville and Lidi Zheng (thanks!)
+    Unlike TimedStorage, ChannelCache actively evicts stale channels even if the cache is not accessed
+    Unlike grpc._simple_stubs.ChannelCache, this implementation supports aio and does not forcibly close active channels
+    """
+    MAXIMUM_CHANNELS = os.environ.get("GRPC_PYTHON_MANAGED_CHANNEL_MAXIMUM", 4096)
+    EVICTION_PERIOD_SECONDS = os.environ.get("GRPC_PYTHON_MANAGED_CHANNEL_EVICTION_SECONDS", 10 * 60)
+    logger.debug(f"Eviction period = {EVICTION_PERIOD_SECONDS}s, max channels = {MAXIMUM_CHANNELS}")
+
+    _singleton: Optional[ChannelCache] = None
+    _singleton_pid: int = os.getpid()
+    _lock: threading.RLock = threading.RLock()
+    _update_eviction_evt: threading.Event = threading.Event()
+
+    def __init__(self, _created_as_singleton=False):
+        assert _created_as_singleton, f"Please use {self.__class__.__name__}.get_singleton()"
+        super().__init__(maxsize=self.MAXIMUM_CHANNELS)
+        self._is_active = True
+        self._nearest_expiration_time = float('inf')
+        self._eviction_thread = threading.Thread(target=self._evict_stale_channels_in_background, daemon=True)
+        self._eviction_thread.start()
+
+    @classmethod
+    def get_singleton(cls):
+        """ Get or create the channel cache for the current process """
+        with cls._lock:
+            if cls._singleton is None or cls._singleton_pid != os.getpid():
+                if cls._singleton is not None:
+                    cls._singleton._stop_background_thread()
+                cls._singleton, cls._singleton_pid = cls(_created_as_singleton=True), os.getpid()
+            return cls._singleton
+
+    @classmethod
+    def get_stub(cls, target: Endpoint, stub_type: Type[Stub], *, aio: bool, options: Sequence[Tuple[str, Any]] = (),
+                 channel_credentials: Optional[grpc.ChannelCredentials] = None,
+                 compression: Optional[grpc.Compression] = None) -> Stub:
+        """
+        Create a grpc channel with given options or reuse pre-existing one
+
+        :param target: the recipient's address and port
+        :param stub_type: a gRPC stub (client) to be instantiated
+        :param aio: if True, returns grpc.aio.Channel, otherwise returns grpc.Channel
+        :param options: see https://grpc.github.io/grpc/core/group__grpc__arg__keys.html
+        :param channel_credentials: if specified, create a secure channel using these credentials (default = insecure)
+        :param compression: see https://github.com/grpc/grpc/tree/master/examples/python/compression
+        """
+        cache = cls.get_singleton()
+        with cls._lock:
+            key = ChannelInfo(target, aio, tuple(options or ()), channel_credentials, compression)
+            entry: ValueWithExpiration = super(cls, cache).get(key)
+            channel, stubs = entry.value if entry is not None else (cls._create_channel(*key), {})
+            if stub_type not in stubs:
+                stubs[stub_type] = stub_type(channel)
+
+            # either cache channel or update expiration of an existing channel
+            expiration_time = get_dht_time() + cls.EVICTION_PERIOD_SECONDS
+            super(cls, cache).store(key, (channel, stubs), expiration_time)
+
+            if expiration_time < cache._nearest_expiration_time:
+                cache._nearest_expiration_time = expiration_time
+                cls._update_eviction_evt.set()
+
+            return stubs[stub_type]
+
+    @classmethod
+    def _create_channel(cls, target: Endpoint, aio: bool, options: Sequence[Tuple[str, Any]],
+                        channel_credentials: Optional[grpc.ChannelCredentials],
+                        compression: Optional[grpc.Compression]) -> Union[grpc.Channel, grpc.aio.Channel]:
+        namespace = grpc.aio if aio else grpc
+        if channel_credentials is None:
+            logger.debug(f"Creating insecure {namespace} channel with options '{options}' "
+                         f"and compression '{compression}'")
+            return namespace.insecure_channel(target, options=options, compression=compression)
+        else:
+            logger.debug(f"Creating secure {namespace} channel with credentials '{channel_credentials}', "
+                         f"options '{options}' and compression '{compression}'")
+            return namespace.secure_channel(target, credentials=channel_credentials,
+                                            options=options, compression=compression)
+
+    def _evict_stale_channels_in_background(self):
+        while self._is_active:
+            now = get_dht_time()
+            time_to_wait = max(0.0, self._nearest_expiration_time - now)
+            interrupted_early = self._update_eviction_evt.wait(time_to_wait if time_to_wait != float('inf') else None)
+            if interrupted_early:
+                self._update_eviction_evt.clear()
+                continue
+
+            with self._lock:
+                self._remove_outdated()
+                _, entry = super().top()
+                self._nearest_expiration_time = entry.expiration_time if entry is not None else float('inf')
+
+    def _stop_background_thread(self):
+        with self._lock:
+            self._is_active = False
+            self._update_eviction_evt.set()
+
+    def store(self, *args, **kwargs) -> NoReturn:
+        raise ValueError(f"Please use {self.__class__.__name__}.get_stub to get or create stubs")
+
+    def get(self, *args, **kwargs) -> NoReturn:
+        raise ValueError(f"Please use {self.__class__.__name__}.get_stub to get or create stubs")
+
+    def top(self) -> NoReturn:
+        raise ValueError(f"Please use {self.__class__.__name__}.get_stub to get or create stubs")
+
 
 FP16_MAX = 65_504
 
 
-def serialize_torch_tensor(tensor: torch.Tensor, compression_type=CompressionType.NONE, 
+def serialize_torch_tensor(tensor: torch.Tensor, compression_type=CompressionType.NONE,
                            allow_inplace=False) -> runtime_pb2.Tensor:
     if compression_type == CompressionType.MEANSTD_LAST_AXIS_FLOAT16:
         assert tensor.dtype == torch.float32

+ 109 - 0
hivemind/utils/timed_storage.py

@@ -0,0 +1,109 @@
+""" A dictionary-like storage that stores items until a specified expiration time or up to a limited size """
+from __future__ import annotations
+import heapq
+import time
+from contextlib import contextmanager
+from typing import TypeVar, NamedTuple, Generic, Optional, Dict, List, Iterator, Tuple
+
+KeyType = TypeVar('KeyType')
+ValueType = TypeVar('ValueType')
+get_dht_time = time.time  # a global (weakly synchronized) time
+DHTExpiration = float
+ROOT = 0
+
+
+class ValueWithExpiration(NamedTuple, Generic[ValueType]):
+    value: ValueType
+    expiration_time: DHTExpiration
+
+
+class HeapEntry(NamedTuple, Generic[KeyType]):
+    expiration_time: DHTExpiration
+    key: KeyType
+
+
+class TimedStorage(Generic[KeyType, ValueType]):
+    """ A dictionary that maintains up to :maxsize: key-value-expiration tuples until their expiration_time """
+    frozen = False  # can be set to True. If true, do not remove outdated elements
+
+    def __init__(self, maxsize: Optional[int] = None):
+        self.maxsize = maxsize or float("inf")
+        self.data: Dict[KeyType, ValueWithExpiration[ValueType]] = dict()
+        self.expiration_heap: List[HeapEntry[KeyType]] = []
+        self.key_to_heap: Dict[KeyType, HeapEntry[KeyType]] = dict()
+
+    def _remove_outdated(self):
+        while not self.frozen and self.expiration_heap and (self.expiration_heap[ROOT].expiration_time < get_dht_time()
+                                                            or len(self.data) > self.maxsize):
+            heap_entry = heapq.heappop(self.expiration_heap)
+            if self.key_to_heap.get(heap_entry.key) == heap_entry:
+                del self.data[heap_entry.key], self.key_to_heap[heap_entry.key]
+
+    def store(self, key: KeyType, value: ValueType, expiration_time: DHTExpiration) -> bool:
+        """
+        Store a (key, value) pair locally at least until expiration_time. See class docstring for details.
+        :returns: True if new value was stored, False if it was rejected (current value is newer)
+        """
+        if expiration_time < get_dht_time() and not self.frozen:
+            return False
+        self.key_to_heap[key] = HeapEntry(expiration_time, key)
+        heapq.heappush(self.expiration_heap, self.key_to_heap[key])
+        if key in self.data:
+            if self.data[key].expiration_time < expiration_time:
+                self.data[key] = ValueWithExpiration(value, expiration_time)
+                return True
+            return False
+        self.data[key] = ValueWithExpiration(value, expiration_time)
+        self._remove_outdated()
+        return True
+
+    def get(self, key: KeyType) -> Optional[ValueWithExpiration[ValueType]]:
+        """ Get a value corresponding to a key if that (key, value) pair was previously stored under this key. """
+        self._remove_outdated()
+        if key in self.data:
+            return self.data[key]
+        return None
+
+    def items(self) -> Iterator[Tuple[KeyType, ValueWithExpiration[ValueType]]]:
+        """ Iterate over (key, value, expiration_time) tuples stored in this storage """
+        self._remove_outdated()
+        return ((key, value_and_expiration) for key, value_and_expiration in self.data.items())
+
+    def top(self) -> Tuple[Optional[KeyType], Optional[ValueWithExpiration[ValueType]]]:
+        """ Return the entry with earliest expiration or None if there isn't any """
+        self._remove_outdated()
+        if self.data:
+            # skip leftover "ghost" entries until first real entry
+            while self.key_to_heap.get(self.expiration_heap[ROOT].key) != self.expiration_heap[ROOT]:
+                heapq.heappop(self.expiration_heap)
+            top_key = self.expiration_heap[ROOT].key
+            return top_key, self.data[top_key]
+        return None, None
+
+    def __contains__(self, key: KeyType):
+        self._remove_outdated()
+        return key in self.data
+
+    def __len__(self):
+        self._remove_outdated()
+        return len(self.data)
+
+    def __delitem__(self, key: KeyType):
+        if key in self.key_to_heap:
+            del self.data[key], self.key_to_heap[key]
+        # note: key may still be in self.expiration_heap, but it will not be used and will eventually be removed by ._remove_outdated()
+
+    def __bool__(self):
+        return bool(self.data)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.data})"
+
+    @contextmanager
+    def freeze(self):
+        """ Temporarily cease to ._remove_outdated() elements inside this context to ensure consistency """
+        prev_frozen, self.frozen = self.frozen, True
+        try:
+            yield self
+        finally:
+            self.frozen = prev_frozen

+ 1 - 0
requirements.txt

@@ -7,4 +7,5 @@ sortedcontainers
 uvloop>=0.14.0
 grpcio>=1.33.2
 grpcio-tools>=1.33.2
+protobuf>=3.12.2
 configargparse>=1.2.3

+ 2 - 1
tests/test_averaging.py

@@ -91,7 +91,8 @@ async def test_allreduce_protocol():
     ]
 
     assert len(averaged_tensors) == len(reference_tensors)
-    assert all(map(torch.allclose, averaged_tensors, reference_tensors))
+    assert all(torch.allclose(our, ref, atol=1e-6, rtol=0)
+               for our, ref in zip(averaged_tensors, reference_tensors))
 
 
 @pytest.mark.forked

+ 6 - 3
tests/test_dht_storage.py

@@ -37,11 +37,14 @@ def test_change_expiration_time():
 
 
 def test_maxsize_cache():
-    d = DHTLocalStorage(maxsize=1)
-    d.store(DHTID.generate("key1"), b"val1", get_dht_time() + 1)
+    d = DHTLocalStorage(maxsize=2)
+    d.store(DHTID.generate("key1a"), b"val1a", get_dht_time() + 1)
+    d.store(DHTID.generate("key1b"), b"val1b", get_dht_time() + 1)
+    d.store(DHTID.generate("key1a"), b"val1a2", get_dht_time() + 2)
     d.store(DHTID.generate("key2"), b"val2", get_dht_time() + 200)
     assert d.get(DHTID.generate("key2"))[0] == b"val2", "Value with bigger exp. time must be kept"
-    assert d.get(DHTID.generate("key1")) is None, "Value with less exp time, must be deleted"
+    assert d.get(DHTID.generate("key1a"))[0] == b"val1a2", "Value with bigger exp. time must be kept"
+    assert d.get(DHTID.generate("key1b")) is None, "Value with less exp time, must be deleted"
 
 
 def test_localstorage_top():

+ 4 - 6
tests/test_moe.py

@@ -30,7 +30,6 @@ def test_call_many():
     backward_k_min = 1
     forward_timeout = None
     backward_timeout = None
-    rtol = 1e-3
     atol = 1e-5
 
     with background_server(num_experts=5, device='cpu', expert_cls='ffn', num_handlers=8, hidden_dim=64,
@@ -61,7 +60,7 @@ def test_call_many():
         reference_outputs[2, 0] = e1(inputs_clone[2:3])
         reference_outputs[2, 2] = e3(inputs_clone[2:3])
 
-        assert torch.allclose(expert_outputs, reference_outputs, rtol, atol)
+        assert torch.allclose(expert_outputs, reference_outputs, atol=atol, rtol=0)
         proj = torch.randn(4, 64)
         loss = (expert_outputs[(0, 1, 1, 2), (0, 2, 1, 0)] * proj).sum()
         loss.backward()
@@ -70,7 +69,7 @@ def test_call_many():
         reference_loss = (reference_outputs[(0, 1, 1, 2), (0, 2, 1, 0)] * proj).sum()
         reference_loss.backward()
         reference_grad = inputs_clone.grad.data.cpu().clone()
-        assert torch.allclose(our_grad, reference_grad, rtol, atol)
+        assert torch.allclose(our_grad, reference_grad, atol=atol, rtol=0)
 
 
 @pytest.mark.forked
@@ -125,7 +124,6 @@ def test_beam_search_correctness():
 
 @pytest.mark.forked
 def test_determinism():
-    rtol = 0
     atol = 1e-5
 
     xx = torch.randn(32, 1024, requires_grad=True)
@@ -141,8 +139,8 @@ def test_determinism():
         grad, = torch.autograd.grad(out.sum(), xx, retain_graph=True)
         grad_rerun, = torch.autograd.grad(out_rerun.sum(), xx, retain_graph=True)
 
-    assert torch.allclose(out, out_rerun, rtol, atol), "Dropout layer outputs are non-deterministic."
-    assert torch.allclose(grad, grad_rerun, rtol, atol), "Gradients are non-deterministic."
+    assert torch.allclose(out, out_rerun, atol=atol, rtol=0), "Dropout layer outputs are non-deterministic."
+    assert torch.allclose(grad, grad_rerun, atol=atol, rtol=0), "Gradients are non-deterministic."
 
 
 @pytest.mark.forked

+ 38 - 1
tests/test_util_modules.py

@@ -3,7 +3,8 @@ import torch
 
 import pytest
 import hivemind
-
+from hivemind.proto.dht_pb2_grpc import DHTStub
+from hivemind.proto.runtime_pb2_grpc import ConnectionHandlerStub
 from concurrent.futures import CancelledError
 
 
@@ -129,3 +130,39 @@ def test_vector_compression(size=(128, 128, 64), alpha=5e-08):
     error = deserialize_torch_tensor(serialize_torch_tensor(X, CompressionType.FLOAT16)) - X
     assert error.square().mean() < alpha
 
+
+@pytest.mark.forked
+@pytest.mark.asyncio
+async def test_channel_cache():
+    hivemind.ChannelCache.MAXIMUM_CHANNELS = 3
+    hivemind.ChannelCache.EVICTION_PERIOD_SECONDS = 0.1
+
+    c1 = hivemind.ChannelCache.get_stub('localhost:1337', DHTStub, aio=False)
+    c2 = hivemind.ChannelCache.get_stub('localhost:1337', DHTStub, aio=True)
+    c3 = hivemind.ChannelCache.get_stub('localhost:1338', DHTStub, aio=False)
+    c3_again = hivemind.ChannelCache.get_stub('localhost:1338', DHTStub, aio=False)
+    c1_again = hivemind.ChannelCache.get_stub('localhost:1337', DHTStub, aio=False)
+    c4 = hivemind.ChannelCache.get_stub('localhost:1339', DHTStub, aio=True)
+    c2_anew = hivemind.ChannelCache.get_stub('localhost:1337', DHTStub, aio=True)
+    c1_yetagain = hivemind.ChannelCache.get_stub('localhost:1337', DHTStub, aio=False)
+
+    await asyncio.sleep(0.2)
+    c1_anew = hivemind.ChannelCache.get_stub(target='localhost:1337', aio=False, stub_type=DHTStub)
+    c1_anew_again = hivemind.ChannelCache.get_stub(target='localhost:1337', aio=False, stub_type=DHTStub)
+    c1_otherstub = hivemind.ChannelCache.get_stub(target='localhost:1337', aio=False, stub_type=ConnectionHandlerStub)
+    await asyncio.sleep(0.05)
+    c1_otherstub_again = hivemind.ChannelCache.get_stub(target='localhost:1337', aio=False,
+                                                        stub_type=ConnectionHandlerStub)
+    all_channels = [c1, c2, c3, c4, c3_again, c1_again, c2_anew, c1_yetagain, c1_anew, c1_anew_again, c1_otherstub]
+
+    assert all(isinstance(c, DHTStub) for c in all_channels[:-1])
+    assert isinstance(all_channels[-1], ConnectionHandlerStub)
+    assert 'aio' in repr(c2.rpc_find)
+    assert 'aio' not in repr(c1.rpc_find)
+
+    duplicates = {(c1, c1_again), (c1, c1_yetagain), (c1_again, c1_yetagain), (c3, c3_again),
+                  (c1_anew, c1_anew_again), (c1_otherstub, c1_otherstub_again)}
+    for i in range(len(all_channels)):
+        for j in range(i + 1, len(all_channels)):
+            ci, cj = all_channels[i], all_channels[j]
+            assert (ci is cj) == ((ci, cj) in duplicates), (i, j)