Implement shortest-path routing for inference (#362)

This PR:

1. **Adds shortest-path routing for inference.** We build a graph whose edge costs combine client-server and server-server latencies, per-block compute costs, and empirically measured overheads (a minimal sketch of the idea is shown after this list). For client-server latencies, we ping the possible first and last servers of a sequence in `SequenceManager.update()`. We penalize servers that may not have enough cache for our request. This uses the info added to the DHT in #355, #356, #358.

2. **Makes a server ping neighboring servers in addition to the next ones.** This gives the client an opportunity to switch servers even before it has used all of a server's blocks (e.g., because a neighboring server is faster). This feature is not enabled yet, since it grows the graph for N servers to O(N^2) edges, but we may enable it if needed.

3. **Fixes a `SequenceManager` bug with the first `update()`.** Previously, this update was likely to produce incorrect information and cause `MissingBlocksError`s until the next update happened.
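
For intuition, here is a minimal, self-contained sketch of how the routing problem reduces to a shortest-path search with the newly added Dijkstar dependency. This is not the code from this PR: the server names, hosted block ranges, and latencies below are made up, and the real `RemoteSequenceManager._build_inference_graph()` derives edge costs from measured RTTs, reported inference throughput, and cache availability instead.

```python
# Toy sketch (not this PR's code): min-latency routing as a shortest-path problem
# using the Dijkstar package added in setup.cfg. All values are illustrative.
from dijkstar import Graph, find_path

n_blocks = 4
servers = {
    "serverA": dict(blocks=range(0, 4), step_delay=0.010),  # hosts blocks 0..3
    "serverB": dict(blocks=range(2, 4), step_delay=0.004),  # hosts blocks 2..3, faster
}
client_rtt = {"serverA": 0.05, "serverB": 0.05}   # client <-> server RTT, seconds
server_rtt = {("serverA", "serverB"): 0.002}      # server <-> server RTT, seconds

graph = Graph()
for name, info in servers.items():
    blocks = info["blocks"]
    if blocks.start == 0:  # this server can be the first hop
        graph.add_edge("client_start", (name, 0), client_rtt[name] / 2)
    if blocks.stop == n_blocks:  # this server can be the last hop
        graph.add_edge((name, n_blocks), "client_end", client_rtt[name] / 2)
    for i in blocks:  # compute cost of running block i on this server
        graph.add_edge((name, i), (name, i + 1), info["step_delay"])
# Switching from serverA to serverB at block 2 costs one extra network hop
graph.add_edge(("serverA", 2), ("serverB", 2), server_rtt[("serverA", "serverB")] / 2)

path = find_path(graph, "client_start", "client_end")
print(path.nodes)       # e.g. ['client_start', ('serverA', 0), ..., ('serverB', 4), 'client_end']
print(path.total_cost)  # estimated latency of one inference step, in seconds
```

The shortest path automatically decides whether switching to a faster server in the middle of the sequence is worth the extra network hop.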
Alexander Borzunov · 2 years ago · commit 62d9ed5ce7

+ 1 - 0
setup.cfg

@@ -48,6 +48,7 @@ install_requires =
     sentencepiece>=0.1.99
     peft@git+https://github.com/huggingface/peft@5884bdbea49e5e71e2cd06ecfa484bb635063735
     safetensors>=0.3.1
+    Dijkstar>=2.6.0
 
 [options.extras_require]
 dev =

+ 1 - 1
src/petals/__init__.py

@@ -11,7 +11,7 @@ from petals.models import *
 from petals.utils import *
 from petals.utils.logging import initialize_logs as _initialize_logs
 
-__version__ = "1.2.0.dev2"
+__version__ = "1.2.0.dev3"
 
 
 if not os.getenv("PETALS_IGNORE_DEPENDENCY_VERSION"):

+ 1 - 1
src/petals/cli/run_server.py

@@ -84,7 +84,7 @@ def main():
     parser.add_argument('--attn_cache_tokens', type=int, default=8192,
                         help='The number of past attention key/value pairs that will be stored between inference steps. '
                              'Default: 8192 (4 simultaneous sessions of up to 2048 tokens).')
-    parser.add_argument('--alloc_timeout', type=float, default=60,
+    parser.add_argument('--alloc_timeout', type=float, default=5,
                         help='If the cache is full, the server will wait for this number of seconds hoping that some memory will be freed '
                              'before rejecting the request')
     parser.add_argument('--revision', type=str, default=None,

+ 3 - 1
src/petals/client/inference_session.py

@@ -340,7 +340,9 @@ class InferenceSession:
                 f"from block {block_idx} to {update_end} will be regenerated"
                 f"from block {block_idx} to {update_end} will be regenerated"
             )
             )
 
 
-        updated_spans = self._sequence_manager.make_sequence(block_idx, update_end, mode="min_latency")
+        updated_spans = self._sequence_manager.make_sequence(
+            block_idx, update_end, mode="min_latency", cache_tokens_needed=self._max_length
+        )
         # make_sequence() could return a longer sequence
         # make_sequence() could return a longer sequence
         updated_spans[-1].end = min(updated_spans[-1].end, update_end)
         updated_spans[-1].end = min(updated_spans[-1].end, update_end)
         updated_sessions = self._enter_server_sessions(updated_spans)
         updated_sessions = self._enter_server_sessions(updated_spans)

+ 173 - 23
src/petals/client/routing/sequence_manager.py

@@ -10,6 +10,7 @@ import time
 from typing import Any, Collection, Dict, List, Optional, Sequence, Union
 from weakref import WeakMethod
 
+import dijkstar
 import numpy as np
 from hivemind import DHT, P2P, MSGPackSerializer, PeerID
 from hivemind.dht.node import Blacklist
@@ -23,6 +24,8 @@ from petals.client.routing.spending_policy import NoSpendingPolicy
 from petals.constants import PUBLIC_INITIAL_PEERS
 from petals.data_structures import ModuleUID, RemoteSpanInfo, ServerState
 from petals.server.handler import TransformerConnectionHandler
+from petals.utils.ping import PingAggregator
+from petals.utils.random import sample_up_to
 
 logger = get_logger(__name__)
 
@@ -33,6 +36,7 @@ class SequenceManagerConfig:
     dht_prefix: Optional[str] = None  # a prefix for all dht keys that correspond to this model (default: model name)
     daemon_startup_timeout: int = 60  # timeout for the libp2p daemon connecting to initial peers
 
+    show_route: Union[str, bool] = "inference"  # show chosen route through servers. one of [False, "inference", True]
     allowed_servers: Optional[Collection[Union[PeerID, str]]] = None  # if defined, send requests only to these servers
     use_server_to_server: bool = True  # Use direct server-to-server communication
 
@@ -43,7 +47,10 @@ class SequenceManagerConfig:
     min_backoff: float = 1  # after a repeated failure, sleep for this many seconds times 2 ** (num_failures - 1)
     max_backoff: float = 60  # limit maximal sleep time between retries to this value
     ban_timeout: float = 15  # when a remote peer fails to respond, prevent routing to that peer for this many seconds
-    active_adapter: Optional[str] = None
+    active_adapter: Optional[str] = None  # name of active LoRA adapter (usually, Hugging Face repo)
+
+    max_pinged: int = 5  # max servers to ping from each sequence side, per update
+    ping_timeout: float = 2  # max time to wait for pings, per update
 
 
 @dataclasses.dataclass
@@ -79,7 +86,6 @@ class RemoteSequenceManager:
         *,
         dht: Optional[DHT] = None,
         state: Optional[SequenceManagerState] = None,
-        active_adapter: Optional[str] = None,
     ):
         assert config.initial_peers or dht is not None, "Please specify `config.initial_peers` or `dht`"
         assert config.dht_prefix, "Could not find dht_prefix in config, please create model with dht_prefix=..."
@@ -94,7 +100,7 @@ class RemoteSequenceManager:
             dht = DHT(
                 initial_peers=config.initial_peers,
                 client_mode=True,
-                num_workers=config.num_hidden_layers,
+                num_workers=32,
                 startup_timeout=config.daemon_startup_timeout,
                 start=True,
             )
@@ -109,25 +115,25 @@ class RemoteSequenceManager:
         self._thread_start_lock = threading.Lock()
         self.policy = NoSpendingPolicy()
 
+        self.ping_aggregator = PingAggregator(dht)
+
         if state.banned_peers is None:
             state.banned_peers = Blacklist(base_time=config.ban_timeout, backoff_rate=2.0)
         if state.sequence_info is None:
             state.sequence_info = RemoteSequenceInfo.make_empty(block_uids)
 
-        if state.sequence_info.last_updated_time is None:
-            # Pre-fetch module infos in DHT in parallel with .from_pretrained(), then use cached records
-            # in the first _update() instead of the latest ones. This makes the first .update() faster.
-            petals.dht_utils.get_remote_module_infos(
-                self.dht, self.block_uids, active_adapter=active_adapter, latest=True, return_future=True
-            )
-            self._need_latest_infos = False
-        else:
+        if state.sequence_info.last_updated_time is not None:
             assert block_uids == state.sequence_info.block_uids
             self._thread.ready.set()  # no need to await the first dht fetch
             self._need_latest_infos = True
 
     def make_sequence(
-        self, start_index: int = 0, end_index: Optional[int] = None, *, mode: str
+        self,
+        start_index: int = 0,
+        end_index: Optional[int] = None,
+        *,
+        mode: str,
+        cache_tokens_needed: Optional[int] = None,
     ) -> List[RemoteSpanInfo]:
         """
         Form a sequence of remote servers that collectively serve all consecutive layers
@@ -143,6 +149,150 @@ class RemoteSequenceManager:
             self.update(wait=True)  # this will await an existing update or trigger a new one (if not updating)
 
         end_index = end_index if end_index is not None else len(self)
+
+        if mode == "min_latency":
+            span_sequence = self._make_sequence_with_min_latency(
+                start_index, end_index, cache_tokens_needed=cache_tokens_needed
+            )
+        elif mode == "max_throughput":
+            span_sequence = self._make_sequence_with_max_throughput(start_index, end_index)
+        else:
+            raise RuntimeError(f"Unexpected mode {mode}")
+
+        if self.config.show_route is True or (mode == "min_latency" and self.config.show_route == "inference"):
+            route_repr = " => ".join(
+                [f"{span.start}:{span.end} via …{str(span.peer_id)[-6:]}" for span in span_sequence]
+            )
+            logger.info(f"Route found: {route_repr}")
+        return span_sequence
+
+    def _make_sequence_with_min_latency(
+        self, start_index: int, end_index: int, *, cache_tokens_needed: Optional[int]
+    ) -> List[RemoteSpanInfo]:
+        if start_index == end_index:
+            return []
+
+        with self.lock_changes:
+            missing_blocks = [
+                block_idx
+                for block_idx in range(start_index, end_index)
+                if not self.state.sequence_info.spans_containing_block[block_idx]
+            ]
+            if missing_blocks:
+                raise MissingBlocksError(missing_blocks)
+            server_infos = {
+                span.peer_id: span.server_info
+                for block_idx in range(start_index, end_index)
+                for span in self.state.sequence_info.spans_containing_block[block_idx]
+            }
+
+            graph = self._build_inference_graph(start_index, end_index, cache_tokens_needed=cache_tokens_needed)
+
+        path = dijkstar.find_path(graph, "start", "end")
+        logger.debug(f"Path info: {path}")
+        if start_index == 0 and end_index == len(self):
+            logger.debug(f"Expected speed: {1 / path.total_cost:.1f} steps/sec")
+
+        span_sequence = []
+        for peer_id, block_idx in path.nodes[1:-1]:
+            if not span_sequence or span_sequence[-1].peer_id != peer_id:
+                span_sequence.append(RemoteSpanInfo(peer_id, block_idx, block_idx, server_infos[peer_id]))
+            else:
+                span_sequence[-1].end = block_idx
+
+        # Remove empty spans that can appear if we don't force the path to go to the end of each server and network
+        # delays violate the triangle inequality (delay(A, B) + delay(B, C) < delay(A, C)) due to measurement errors
+        span_sequence = [span for span in span_sequence if span.length > 0]
+
+        return span_sequence
+
+    def _build_inference_graph(
+        self,
+        start_index: int,
+        end_index: int,
+        *,
+        cache_tokens_needed: Optional[int],
+        overhead_coeff: float = 1.82,  # Backend overhead (empirically measured)
+        overhead_delay: float = 0.018,  # Serialization overhead (empirically measured)
+        default_inference_rps: float = 300,  # If inference RPS unknown
+        alloc_delay: float = 10,  # If not enough cache left, we penalize the edge
+    ) -> dijkstar.Graph:
+        missing_blocks = [
+            block_idx
+            for block_idx in range(start_index, end_index)
+            if not self.state.sequence_info.spans_containing_block[block_idx]
+        ]
+        if missing_blocks:
+            raise MissingBlocksError(missing_blocks)
+
+        client_server_rtts = self.ping_aggregator.to_dict()
+
+        graph = dijkstar.Graph()
+
+        # Client -> server network delays
+        for span in self.state.sequence_info.spans_containing_block[start_index]:
+            delay = self._rtt_to_delay(client_server_rtts.get(span.peer_id))
+            delay += overhead_delay
+            if not self._has_cache_for(span, cache_tokens_needed):
+                delay += alloc_delay
+            graph.add_edge("start", (span.peer_id, start_index), delay)
+
+        # Server -> client network delays
+        for span in self.state.sequence_info.spans_containing_block[end_index - 1]:
+            delay = self._rtt_to_delay(client_server_rtts.get(span.peer_id))
+            graph.add_edge((span.peer_id, end_index), "end", delay)
+
+        # Server -> server network delays
+        for block_idx in range(start_index + 1, end_index):
+            for cur_span in self.state.sequence_info.spans_containing_block[block_idx - 1]:
+                if cur_span.end != block_idx:
+                    # If we choose a server, we force the path to go to its end before switching to a new one,
+                    # to avoid O(N^2) graphs for N servers
+                    continue
+
+                for next_span in self.state.sequence_info.spans_containing_block[block_idx]:
+                    rtt = None
+                    if cur_span.server_info.next_pings is not None:
+                        rtt = cur_span.server_info.next_pings.get(next_span.peer_id.to_base58())
+                    delay = self._rtt_to_delay(rtt)
+                    delay += overhead_delay
+                    if not self._has_cache_for(next_span, cache_tokens_needed):
+                        delay += alloc_delay
+                    graph.add_edge((cur_span.peer_id, block_idx), (next_span.peer_id, block_idx), delay)
+
+        # Compute delays
+        for span in self.state.sequence_info.spans_by_priority:
+            for block_idx in range(max(span.start, start_index), min(span.end, end_index)):
+                inference_rps = span.server_info.inference_rps
+                if inference_rps is None:
+                    inference_rps = default_inference_rps
+                graph.add_edge((span.peer_id, block_idx), (span.peer_id, block_idx + 1), overhead_coeff / inference_rps)
+
+        return graph
+
+    @staticmethod
+    def _rtt_to_delay(
+        rtt: Optional[float],
+        *,
+        default_delay: float = 0.15,  # If network delay unknown
+        max_delay: float = 5,  # If unreachable, we don't want to discard the edge completely
+    ) -> float:
+        if rtt is None:
+            return default_delay
+        return min(rtt / 2, max_delay)
+
+    @staticmethod
+    def _has_cache_for(span: RemoteSpanInfo, cache_tokens_needed: Optional[int] = None) -> bool:
+        if cache_tokens_needed is None or span.server_info.cache_tokens_left is None:
+            return True
+
+        # Here, `span` contains all blocks hosted by a server - but we won't necessarily run all of them through
+        # this particular server in our path. It is difficult to estimate how many blocks we'll use at this stage,
+        # so we assume that we'll use all of them (the worst case for the cache size) and get a pessimistic estimate.
+        # This is okay since false positives are more costly than false negatives here.
+        return cache_tokens_needed * 2 * span.length <= span.server_info.cache_tokens_left
+
+    def _make_sequence_with_max_throughput(self, start_index: int, end_index: int) -> List[RemoteSpanInfo]:
         span_sequence = []
         current_index = start_index
         while current_index < end_index:
@@ -150,20 +300,12 @@ class RemoteSequenceManager:
             if not candidate_spans:
                 raise MissingBlocksError(current_index)
 
-            if mode == "max_throughput":
-                span_weights = np.array([span.server_info.throughput for span in candidate_spans], dtype=np.float64)
-            elif mode == "min_latency":
-                span_weights = np.array([span.end - current_index for span in candidate_spans], dtype=np.float64)
-            else:
-                raise RuntimeError(f"Unexpected mode {mode}")
+            span_weights = np.array([span.server_info.throughput for span in candidate_spans], dtype=np.float64)
             chosen_span = np.random.choice(candidate_spans, p=span_weights / span_weights.sum())
 
             assert chosen_span.start <= current_index < chosen_span.end
             span_sequence.append(dataclasses.replace(chosen_span, start=current_index))
             current_index = chosen_span.end
-
-        route_repr = " => ".join([f"{span.start}:{span.end} via …{str(span.peer_id)[-6:]}" for span in span_sequence])
-        logger.debug(f"Route found: {route_repr}")
         return span_sequence
 
     def __getitem__(self, ix: Union[int, slice]) -> RemoteSequenceManager:
@@ -182,10 +324,10 @@ class RemoteSequenceManager:
 
 
     def _update(self):
         """Perform an immediate and synchronous refresh, may take time"""
+
         new_block_infos = petals.dht_utils.get_remote_module_infos(
-            self.dht, self.block_uids, active_adapter=self.config.active_adapter, latest=self._need_latest_infos
+            self.dht, self.block_uids, active_adapter=self.config.active_adapter, latest=True
         )
-        self._need_latest_infos = True  # All future _update() should use latest infos
 
         for block_info in new_block_infos:
             if not block_info:
@@ -217,6 +359,14 @@ class RemoteSequenceManager:
 
 
         with self.lock_changes:
             self.state.sequence_info.update_(new_block_infos)
+
+            first_servers = [span.peer_id for span in self.state.sequence_info.spans_containing_block[0]]
+            last_servers = [span.peer_id for span in self.state.sequence_info.spans_containing_block[-1]]
+
+        pinged_servers = set(sample_up_to(first_servers, self.config.max_pinged))
+        pinged_servers |= set(sample_up_to(last_servers, self.config.max_pinged))
+        self.ping_aggregator.ping(list(pinged_servers), wait_timeout=self.config.ping_timeout)
+
         self.ready.set()
 
     def on_request_failure(self, peer_id: Optional[PeerID]):

+ 16 - 15
src/petals/server/server.py

@@ -32,6 +32,7 @@ from petals.server.throughput import get_dtype_name, get_server_throughput
 from petals.utils.auto_config import AutoDistributedConfig
 from petals.utils.convert_block import QuantType, check_device_balance, convert_block
 from petals.utils.ping import PingAggregator
+from petals.utils.random import sample_up_to
 from petals.utils.version import get_compatible_model_repo
 
 logger = get_logger(__name__)
@@ -61,7 +62,7 @@ class Server:
         cache_dir: Optional[str] = None,
         max_disk_space: Optional[int] = None,
         attn_cache_tokens: int = 8192,
-        alloc_timeout: float = 60,
+        alloc_timeout: float = 5,
         device: Optional[Union[str, torch.device]] = None,
         compression=CompressionType.NONE,
         stats_report_interval: Optional[int] = None,
@@ -637,7 +638,6 @@ class ModuleAnnouncerThread(threading.Thread):
         update_period: float,
         expiration: float,
         max_pinged: int = 5,
-        max_reported: int = 10,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -650,10 +650,11 @@ class ModuleAnnouncerThread(threading.Thread):
         self.expiration = expiration
         self.trigger = threading.Event()
 
-        self.max_pinged, self.max_reported = max_pinged, max_reported
-        last_uid = max(module_uids, key=lambda uid: int(uid.split(UID_DELIMITER)[-1]))
-        dht_prefix, block_index = last_uid.split(UID_DELIMITER)
-        self.next_uid = f"{dht_prefix}{UID_DELIMITER}{int(block_index) + 1}"
+        self.max_pinged = max_pinged
+        dht_prefix = module_uids[0].split(UID_DELIMITER)[0]
+        block_indices = [int(uid.split(UID_DELIMITER)[-1]) for uid in module_uids]
+        start_block, end_block = min(block_indices), max(block_indices) + 1
+        self.next_uids = [f"{dht_prefix}{UID_DELIMITER}{i}" for i in range(start_block + 1, end_block + 1)]
         self.ping_aggregator = PingAggregator(self.dht)
 
     def run(self) -> None:
@@ -664,7 +665,7 @@ class ModuleAnnouncerThread(threading.Thread):
             if self.server_info.state != ServerState.OFFLINE:
                 self._ping_next_servers()
                 self.server_info.next_pings = {
-                    peer_id.to_base58(): rtt for peer_id, rtt in self.ping_aggregator.fastest(self.max_reported).items()
+                    peer_id.to_base58(): rtt for peer_id, rtt in self.ping_aggregator.to_dict().items()
                 }
             else:
                 self.server_info.next_pings = None  # No need to ping if we're disconnecting
@@ -691,14 +692,14 @@ class ModuleAnnouncerThread(threading.Thread):
             self.join()
 
     def _ping_next_servers(self) -> Dict[hivemind.PeerID, float]:
-        [module_info] = get_remote_module_infos(self.dht, [self.next_uid], latest=True)
-        if module_info is None:
-            return
-
-        next_servers = list(module_info.servers)
-        if len(next_servers) > self.max_pinged:
-            next_servers = random.sample(next_servers, self.max_pinged)
-        self.ping_aggregator.ping(next_servers)
+        module_infos = get_remote_module_infos(self.dht, self.next_uids, latest=True)
+        middle_servers = {peer_id for info in module_infos[:-1] if info is not None for peer_id in info.servers}
+        pinged_servers = set(sample_up_to(middle_servers, self.max_pinged))
+        pinged_servers.discard(self.dht.peer_id)
+        if module_infos[-1] is not None:
+            # Sample servers hosting the block after the last one (most likely continuations) separately
+            pinged_servers |= set(sample_up_to(module_infos[-1].servers, self.max_pinged))
+        self.ping_aggregator.ping(list(pinged_servers))
 
 
 class RuntimeWithDeduplicatedPools(Runtime):

+ 15 - 14
src/petals/utils/ping.py

@@ -1,5 +1,6 @@
 import asyncio
 import math
+import threading
 import time
 from functools import partial
 from typing import Dict, Sequence
@@ -34,27 +35,27 @@ async def ping_parallel(peer_ids: Sequence[hivemind.PeerID], *args, **kwargs) ->
 
 
 
 
 class PingAggregator:
-    def __init__(self, dht: hivemind.DHT, *, ema_alpha: float = 0.2, expiration: float = 3600):
+    def __init__(self, dht: hivemind.DHT, *, ema_alpha: float = 0.2, expiration: float = 300):
         self.dht = dht
         self.ema_alpha = ema_alpha
         self.expiration = expiration
         self.ping_emas = hivemind.TimedStorage()
+        self.lock = threading.Lock()
 
-    def ping(self, peer_ids: Sequence[hivemind.PeerID], **kwargs):
+    def ping(self, peer_ids: Sequence[hivemind.PeerID], **kwargs) -> None:
         current_rtts = self.dht.run_coroutine(partial(ping_parallel, peer_ids, **kwargs))
         logger.debug(f"Current RTTs: {current_rtts}")
 
-        expiration = hivemind.get_dht_time() + self.expiration
-        for peer_id, rtt in current_rtts.items():
-            prev_rtt = self.ping_emas.get(peer_id)
-            if prev_rtt is not None and prev_rtt.value != math.inf:
-                rtt = self.ema_alpha * rtt + (1 - self.ema_alpha) * prev_rtt.value  # Exponential smoothing
-            self.ping_emas.store(peer_id, rtt, expiration)
+        with self.lock:
+            expiration = hivemind.get_dht_time() + self.expiration
+            for peer_id, rtt in current_rtts.items():
+                prev_rtt = self.ping_emas.get(peer_id)
+                if prev_rtt is not None and prev_rtt.value != math.inf:
+                    rtt = self.ema_alpha * rtt + (1 - self.ema_alpha) * prev_rtt.value  # Exponential smoothing
+                self.ping_emas.store(peer_id, rtt, expiration)
 
-    def fastest(self, n_peers: int) -> Dict[hivemind.PeerID, float]:
-        with self.ping_emas.freeze():
+    def to_dict(self) -> Dict[hivemind.PeerID, float]:
+        with self.lock, self.ping_emas.freeze():
             smoothed_rtts = {peer_id: rtt.value for peer_id, rtt in self.ping_emas.items()}
-        logger.debug(f"Smothed RTTs: {smoothed_rtts}")
-
-        fastest_rtts = sorted(smoothed_rtts.items(), key=lambda item: item[1])[:n_peers]
-        return dict(fastest_rtts)
+            logger.debug(f"Smothed RTTs: {smoothed_rtts}")
+            return smoothed_rtts
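
The `ping()` method above smooths repeated RTT measurements with a standard exponential moving average. A standalone illustration of that formula (the RTT values are made up):

```python
# Exponential smoothing as used in PingAggregator.ping(), with toy RTT measurements.
ema_alpha = 0.2
measurements = [0.080, 0.120, 0.060]  # hypothetical RTTs in seconds

smoothed = None
for rtt in measurements:
    # The first measurement is taken as-is; later ones are blended with the previous estimate
    smoothed = rtt if smoothed is None else ema_alpha * rtt + (1 - ema_alpha) * smoothed
    print(round(smoothed, 4))  # 0.08, 0.088, 0.0824
```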

+ 12 - 0
src/petals/utils/random.py

@@ -0,0 +1,12 @@
+import random
+from typing import Collection, List, TypeVar
+
+T = TypeVar("T")
+
+
+def sample_up_to(population: Collection[T], k: int) -> List[T]:
+    if not isinstance(population, list):
+        population = list(population)
+    if len(population) > k:
+        population = random.sample(population, k)
+    return population
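
A quick usage note for this helper (the peer list below is illustrative): it returns the whole population as a list when there are at most `k` items, and a uniform random sample of `k` items otherwise.

```python
from petals.utils.random import sample_up_to

peers = ["peerA", "peerB", "peerC"]  # hypothetical peer IDs
print(sample_up_to(peers, 5))  # all 3 peers: the population is smaller than k
print(sample_up_to(peers, 2))  # 2 peers chosen uniformly at random
```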