3 years ago · 50535a8435
--- a/cli/run_server.py
+++ b/cli/run_server.py
@@ -31,15 +31,19 @@ def main():
 
															     parser.add_argument('--num_handlers', type=int, default=8, required=False,
														
 
															                         help='server will use this many processes to handle incoming requests')
														
 
															     parser.add_argument('--min_batch_size', type=int, default=1,
														
 
															-                        help='Minimum required batch size for all expert operations')
														
 
															+                        help='Minimum required batch size for all operations (in total tokens)')
														
 
															     parser.add_argument('--max_batch_size', type=int, default=16384,
														
 
															                         help='The total number of tokens in the same batch will not exceed this value')
														
 
															+    parser.add_argument('--prefetch_batches', type=int, default=1, required=False,
														
 
															+                        help='Pre-form this many subsequent batches while GPU is processing the current one')
														
 
															+    parser.add_argument('--sender_threads', type=int, default=1, required=False,
														
 
															+                        help='Use this many threads to pass results/exceptions from Runtime to Pools')
														
 
															     parser.add_argument('--inference_max_length', type=int, default=16384,
														
 
															                         help='Maximum total sequence length permitted per inference, defaults to 16384 tokens')
														
 
															     parser.add_argument('--cache_dir', type=str, default=None, 
														
 
															                         help='Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.')
														
 
															     parser.add_argument('--device', type=str, default=None, required=False,
														
 
															-                        help='all experts will use this device in torch notation; default: cuda if available else cpu')
														
 
															+                        help='all blocks will use this device in torch notation; default: cuda if available else cpu')
														
 
															     parser.add_argument("--torch_dtype", type=str, default="auto",
														
 
															                         help="Use this dtype to store block weights and do computations. "
														
 
															                              "By default, respect the dtypes in the pre-trained state dict.")
														
@@ -58,7 +62,7 @@ def main():
 
															                              'on the first run and uses these estimates for future runs. '
														
 
															                              'If set to "eval", the script re-evaluates the throughput and overrides the cache.')
														
 
															     parser.add_argument('--update_period', type=float, required=False, default=30,
														
 
															-                        help='Server will report experts to DHT once in this many seconds')
														
 
															+                        help='Server will report blocks to DHT once in this many seconds')
														
 
															     parser.add_argument('--expiration', type=float, required=False, default=None,
														
 
															                         help='DHT entries will expire after this many seconds')
														
 
															     parser.add_argument('--initial_peers', type=str, nargs='*', required=False, default=[],
														
--- a/src/client/__init__.py
+++ b/src/client/__init__.py
@@ -2,3 +2,4 @@ from src.client.inference_session import RemoteSequentialInferenceSession, Remot
 
															 from src.client.remote_model import DistributedBloomConfig, DistributedBloomForCausalLM, DistributedBloomModel
														
 
															 from src.client.remote_sequential import RemoteSequential, RemoteTransformerBlock
														
 
															 from src.client.sequence_manager import RemoteSequenceManager
														
 
															+from src.client.spending_policy import NoSpendingPolicy, SpendingPolicyBase
														
--- a/src/client/inference_session.py
+++ b/src/client/inference_session.py
@@ -43,6 +43,7 @@ class RemoteTransformerBlockInferenceSession:
 
															         outputs_aiter: AsyncIterator,
														
 
															         *,
														
 
															         max_length: int,
														
 
															+        points: int = 0,
														
 
															     ):
														
 
															         self.uid, self.rpc_info = uid, rpc_info
														
 
															         self.num_blocks = uid.count(CHAIN_DELIMITER) + 1
														
@@ -50,7 +51,7 @@ class RemoteTransformerBlockInferenceSession:
 
															         # using them in any other EventLoop may cause side-effects including, headaches, diarrhea, and loss of sleep
														
 
															         self._inputs_queue: asyncio.Queue[runtime_pb2.ExpertRequest] = inputs_queue
														
 
															         self._outputs_stream: AsyncIterator[runtime_pb2.ExpertResponse] = outputs_aiter
														
 
															-        self._serialized_metadata = MSGPackSerializer.dumps(dict(max_length=max_length))
														
 
															+        self._serialized_metadata = MSGPackSerializer.dumps(dict(max_length=max_length, points=points))
														
 
															         self.stepped = False
														
 
															         self.closed = False
														
--- a/src/client/remote_forward_backward.py
+++ b/src/client/remote_forward_backward.py
@@ -0,0 +1,156 @@
 
															+"""
														
 
															+Utility functions that call RPC forward or backward on a single remote server
														
 
															+"""
														
 
															+import asyncio
														
 
															+from typing import Iterable, List, Sequence, Tuple
														
 
															+
														
 
															+import torch
														
 
															+from hivemind import nested_compare, nested_flatten, nested_pack, serialize_torch_tensor
														
 
															+from hivemind.compression.serialization import deserialize_tensor_stream, deserialize_torch_tensor
														
 
															+from hivemind.p2p import StubBase
														
 
															+from hivemind.p2p.p2p_daemon_bindings.control import DEFAULT_MAX_MSG_SIZE, MAX_UNARY_PAYLOAD_SIZE
														
 
															+from hivemind.proto import runtime_pb2
														
 
															+from hivemind.utils.asyncio import amap_in_executor, iter_as_aiter
														
 
															+from hivemind.utils.streaming import split_for_streaming
														
 
															+
														
 
															+from src.data_structures import ModuleUID, RPCInfo
														
 
															+
														
 
															+
														
 
															+async def run_remote_forward(
														
 
															+    uid: ModuleUID, stub: StubBase, rpc_info: RPCInfo, *inputs: torch.Tensor, metadata: bytes = b"", **kwargs
														
 
															+) -> Tuple[torch.Tensor, ...]:
														
 
															+    """
														
 
															+    Serializes input tensors and calls "rpc_forward" on a remote server.
														
 
															+    Mostly adapted from https://github.com/learning-at-home/hivemind/blob/7a7c93aefffc9494c39e7b170c07cb06d8c09c4c/hivemind/moe/client/expert.py#L198
														
 
															+    but without RemoteExpertWorker.run_coroutine() call that leads to deadlock here.
														
 
															+    """
														
 
															+
														
 
															+    # Note: *inputs are flattened input tensors that follow the expert's info['input_schema']
														
 
															+    # detach to avoid pickling the computation graph
														
 
															+    assert len(kwargs) == len(rpc_info["keyword_names"]), f"Keyword args should be {rpc_info['keyword_names']}"
														
 
															+    kwargs = {key: kwargs[key] for key in rpc_info["keyword_names"]}
														
 
															+
														
 
															+    # Note: we put keyword arguments in the same order as on a server to prevent f(a=1, b=2) != f(b=2, a=1) errors
														
 
															+    forward_inputs = (inputs, kwargs)
														
 
															+
														
 
															+    # Modify forward_schema to support prompts
														
 
															+    args_schema, kwargs_schema = rpc_info["forward_schema"]
														
 
															+    # TODO: rm this assert when support arbitrary number of input tensors
														
 
															+    assert len(args_schema) == 1 and len(inputs) == 2
														
 
															+    forward_schema_with_prompts = (tuple(args_schema * len(inputs)), kwargs_schema)
														
 
															+
														
 
															+    if not nested_compare(forward_inputs, forward_schema_with_prompts):
														
 
															+        raise TypeError(f"Inputs do not match expert input schema. Did you pass the right number of parameters?")
														
 
															+
														
 
															+    forward_inputs = nested_flatten(forward_inputs)
														
 
															+    inputs = tuple(tensor.cpu().detach() for tensor in forward_inputs)
														
 
															+
														
 
															+    # Asynchronous serialization
														
 
															+    loop = asyncio.get_running_loop()
														
 
															+    serialized_tensors = await asyncio.gather(
														
 
															+        *(
														
 
															+            loop.run_in_executor(None, serialize_torch_tensor, tensor.to(proto.dtype), proto.compression)
														
 
															+            for tensor, proto in zip(inputs, nested_flatten(forward_schema_with_prompts))
														
 
															+        )
														
 
															+    )
														
 
															+
														
 
															+    # call RPC on remote server
														
 
															+    size = sum(t.element_size() * t.nelement() for t in inputs)
														
 
															+    if size > MAX_UNARY_PAYLOAD_SIZE:
														
 
															+        deserialized_outputs = await _forward_stream(uid, serialized_tensors, stub, **kwargs)
														
 
															+    else:
														
 
															+        deserialized_outputs = await _forward_unary(uid, serialized_tensors, stub, **kwargs)
														
 
															+
														
 
															+    return nested_pack(deserialized_outputs, structure=rpc_info["outputs_schema"])
														
 
															+
														
 
															+
														
 
															+async def _forward_stream(
														
 
															+    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, **kwargs
														
 
															+) -> List[torch.Tensor]:
														
 
															+    split = (p for t in serialized_tensors for p in split_for_streaming(t, DEFAULT_MAX_MSG_SIZE))
														
 
															+
														
 
															+    outputs = await stub.rpc_forward_stream(
														
 
															+        amap_in_executor(
														
 
															+            lambda tensor: runtime_pb2.ExpertRequest(uid=uid, tensors=[tensor], **kwargs),
														
 
															+            iter_as_aiter(split),
														
 
															+        ),
														
 
															+    )
														
 
															+
														
 
															+    tensors_stream = amap_in_executor(lambda msg: msg.tensors, outputs)
														
 
															+    return await deserialize_tensor_stream(tensors_stream)
														
 
															+
														
 
															+
														
 
															+async def _forward_unary(
														
 
															+    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, **kwargs
														
 
															+) -> List[torch.Tensor]:
														
 
															+    outputs: runtime_pb2.ExpertResponse = await stub.rpc_forward(
														
 
															+        runtime_pb2.ExpertRequest(uid=uid, tensors=list(serialized_tensors), **kwargs)
														
 
															+    )
														
 
															+    return [deserialize_torch_tensor(t) for t in outputs.tensors]
														
 
															+
														
 
															+
														
 
															+async def _backward_stream(
														
 
															+    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, **kwargs
														
 
															+) -> List[torch.Tensor]:
														
 
															+    split = (part for tensor in serialized_tensors for part in split_for_streaming(tensor, DEFAULT_MAX_MSG_SIZE))
														
 
															+
														
 
															+    grad_inputs = await stub.rpc_backward_stream(
														
 
															+        amap_in_executor(
														
 
															+            lambda tensor: runtime_pb2.ExpertRequest(uid=uid, tensors=[tensor], **kwargs),
														
 
															+            iter_as_aiter(split),
														
 
															+        ),
														
 
															+    )
														
 
															+    tensors_stream = amap_in_executor(lambda msg: msg.tensors, grad_inputs)
														
 
															+    return await deserialize_tensor_stream(tensors_stream)
														
 
															+
														
 
															+
														
 
															+async def run_remote_backward(
														
 
															+    uid: ModuleUID,
														
 
															+    stub: StubBase,
														
 
															+    rpc_info: RPCInfo,
														
 
															+    inputs: torch.Tensor,
														
 
															+    grad_outputs: List[torch.Tensor],
														
 
															+    *extra_tensors: torch.Tensor,
														
 
															+    **kwargs,
														
 
															+) -> Sequence[torch.Tensor]:
														
 
															+    """
														
 
															+    Serializes grad outputs and calls "rpc_backward" on a remote server.
														
 
															+    Mostly adapted from https://github.com/learning-at-home/hivemind/blob/7a7c93aefffc9494c39e7b170c07cb06d8c09c4c/hivemind/moe/client/expert.py#L221
														
 
															+    but without RemoteExpertWorker.run_coroutine() call that leads to deadlock here.
														
 
															+    """
														
 
															+
														
 
															+    grad_outputs_cpu = tuple(tensor.cpu() for tensor in grad_outputs)
														
 
															+    inputs_and_grad_outputs = tuple(nested_flatten((inputs, grad_outputs_cpu, *extra_tensors)))
														
 
															+
														
 
															+    # Modify forward_schema to support prompts
														
 
															+    args_schema, kwargs_schema = rpc_info["forward_schema"]
														
 
															+    assert len(args_schema) == 1 and isinstance(inputs, torch.Tensor)
														
 
															+    # TODO generalize this
														
 
															+    prompts_schema = next(iter(args_schema))
														
 
															+    backward_schema = tuple(nested_flatten((rpc_info["forward_schema"], rpc_info["outputs_schema"], prompts_schema)))
														
 
															+
														
 
															+    # Asynchronous serialization
														
 
															+    loop = asyncio.get_running_loop()
														
 
															+    serialized_tensors = await asyncio.gather(
														
 
															+        *(
														
 
															+            loop.run_in_executor(None, serialize_torch_tensor, tensor.to(proto.dtype), proto.compression)
														
 
															+            for tensor, proto in zip(inputs_and_grad_outputs, backward_schema)
														
 
															+        )
														
 
															+    )
														
 
															+
														
 
															+    size = sum(t.element_size() * t.nelement() for t in inputs_and_grad_outputs)
														
 
															+    if size > MAX_UNARY_PAYLOAD_SIZE:
														
 
															+        deserialized_grad_inputs = await _backward_stream(uid, serialized_tensors, stub, **kwargs)
														
 
															+    else:
														
 
															+        deserialized_grad_inputs = await _backward_unary(uid, serialized_tensors, stub, **kwargs)
														
 
															+
														
 
															+    return deserialized_grad_inputs
														
 
															+
														
 
															+
														
 
															+async def _backward_unary(
														
 
															+    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, **kwargs
														
 
															+) -> List[torch.Tensor]:
														
 
															+    grad_inputs: runtime_pb2.ExpertResponse = await stub.rpc_backward(
														
 
															+        runtime_pb2.ExpertRequest(uid=uid, tensors=list(serialized_tensors), **kwargs)
														
 
															+    )
														
 
															+    return [deserialize_torch_tensor(t) for t in grad_inputs.tensors]
														
--- a/src/client/sequence_manager.py
+++ b/src/client/sequence_manager.py
@@ -9,6 +9,7 @@ from hivemind.moe.client.remote_expert_worker import RemoteExpertWorker
 
															 from hivemind.proto import runtime_pb2
														
 
															 from hivemind.utils.logging import get_logger, use_hivemind_log_handler
														
 
															+from src.client.spending_policy import NoSpendingPolicy
														
 
															 from src.data_structures import ModuleUID, RemoteModuleInfo, RemoteSpanInfo, ServerState
														
 
															 from src.dht_utils import get_remote_module_infos
														
 
															 from src.server.handler import TransformerConnectionHandler
														
@@ -24,6 +25,7 @@ class RemoteSequenceManager:
 
															     """
														
 
															     def __init__(self, dht: DHT, block_uids: Sequence[ModuleUID], p2p: P2P, max_retries: int = 3):
														
 
															+        assert len(block_uids) > 0, "Sequences must contain at least one block"
														
 
															         self.dht, self.p2p = dht, p2p
														
 
															         self.block_uids: List[ModuleUID] = list(block_uids)
														
 
															         self.block_infos: List[Optional[RemoteModuleInfo]] = [None] * len(self.block_uids)
														
@@ -39,7 +41,7 @@ class RemoteSequenceManager:
 
															             assert info is not None, f"Found no remote peers for block {uid}"
														
 
															         assert self.spans_by_priority and self.spans_containing_block
														
 
															-    def make_sequence(self, start_index: int = 0, end_index: Optional[int] = None) -> Sequence[RemoteSpanInfo]:
														
 
															+    def make_sequence(self, start_index: int = 0, end_index: Optional[int] = None) -> List[RemoteSpanInfo]:
														
 
															         """
														
 
															         Form a sequence of remote servers that collectively serve all consecutive layers
														
--- a/src/client/sequential_autograd.py
+++ b/src/client/sequential_autograd.py
@@ -1,102 +1,22 @@
 
															+"""
														
 
															+A PyTorch autograd function that runs forward/backward on a sequence of remote servers in a fault-tolerant manner
														
 
															+"""
														
 
															 import asyncio
														
 
															 import logging
														
 
															 from typing import List, Optional, Sequence, Tuple
														
 
															 import torch
														
 
															-from hivemind import serialize_torch_tensor
														
 
															-from hivemind.moe.client.expert import expert_backward, expert_forward
														
 
															 from hivemind.moe.client.remote_expert_worker import RemoteExpertWorker
														
 
															-from hivemind.p2p import StubBase
														
 
															-from hivemind.utils.nested import nested_compare, nested_flatten, nested_pack
														
 
															+from src.client.remote_forward_backward import run_remote_backward, run_remote_forward
														
 
															 from src.client.sequence_manager import RemoteSequenceManager
														
 
															-from src.data_structures import CHAIN_DELIMITER, ModuleUID, RemoteSpanInfo, RPCInfo
														
 
															+from src.data_structures import CHAIN_DELIMITER, RemoteSpanInfo
														
 
															 from src.server.handler import TransformerConnectionHandler
														
 
															 from src.utils.misc import DUMMY, is_dummy
														
 
															 MAX_TOKENS_IN_BATCH = 1024
														
 
															-async def run_expert_forward(
														
 
															-    uid: ModuleUID, stub: StubBase, rpc_info: RPCInfo, *inputs: torch.Tensor, **kwargs
														
 
															-) -> Tuple[torch.Tensor, ...]:
														
 
															-    """
														
 
															-    Serializes input tensors and calls "expert_forward".
														
 
															-    Mostly adapted from https://github.com/learning-at-home/hivemind/blob/7a7c93aefffc9494c39e7b170c07cb06d8c09c4c/hivemind/moe/client/expert.py#L198
														
 
															-    but without RemoteExpertWorker.run_coroutine() call that leads to deadlock here.
														
 
															-    """
														
 
															-
														
 
															-    # Note: *inputs are flattened input tensors that follow the expert's info['input_schema']
														
 
															-    # detach to avoid pickling the computation graph
														
 
															-    assert len(kwargs) == len(rpc_info["keyword_names"]), f"Keyword args should be {rpc_info['keyword_names']}"
														
 
															-    kwargs = {key: kwargs[key] for key in rpc_info["keyword_names"]}
														
 
															-
														
 
															-    # Note: we put keyword arguments in the same order as on a server to prevent f(a=1, b=2) != f(b=2, a=1) errors
														
 
															-    forward_inputs = (inputs, kwargs)
														
 
															-
														
 
															-    # Modify forward_schema to support prompts
														
 
															-    args_schema, kwargs_schema = rpc_info["forward_schema"]
														
 
															-    # TODO: rm this assert when support arbitrary number of input tensors
														
 
															-    assert len(args_schema) == 1 and len(inputs) == 2
														
 
															-    forward_schema_with_prompts = (tuple(args_schema * len(inputs)), kwargs_schema)
														
 
															-
														
 
															-    if not nested_compare(forward_inputs, forward_schema_with_prompts):
														
 
															-        raise TypeError(f"Inputs do not match expert input schema. Did you pass the right number of parameters?")
														
 
															-
														
 
															-    forward_inputs = nested_flatten(forward_inputs)
														
 
															-    inputs = tuple(tensor.cpu().detach() for tensor in forward_inputs)
														
 
															-
														
 
															-    # Asynchronous serialization
														
 
															-    loop = asyncio.get_running_loop()
														
 
															-    serialized_tensors = await asyncio.gather(
														
 
															-        *(
														
 
															-            loop.run_in_executor(None, serialize_torch_tensor, tensor.to(proto.dtype), proto.compression)
														
 
															-            for tensor, proto in zip(inputs, nested_flatten(forward_schema_with_prompts))
														
 
															-        )
														
 
															-    )
														
 
															-
														
 
															-    deserialized_outputs = await expert_forward(uid, inputs, serialized_tensors, stub)
														
 
															-    flat_outputs = tuple(deserialized_outputs)
														
 
															-    return nested_pack(flat_outputs, structure=rpc_info["outputs_schema"])
														
 
															-
														
 
															-
														
 
															-async def run_expert_backward(
														
 
															-    uid: ModuleUID,
														
 
															-    stub: StubBase,
														
 
															-    rpc_info: RPCInfo,
														
 
															-    inputs: torch.Tensor,
														
 
															-    grad_outputs: List[torch.Tensor],
														
 
															-    *extra_tensors: torch.Tensor,
														
 
															-) -> Sequence[torch.Tensor]:
														
 
															-    """
														
 
															-    Serializes grad outputs and calls "expert_backward".
														
 
															-    Mostly adapted from https://github.com/learning-at-home/hivemind/blob/7a7c93aefffc9494c39e7b170c07cb06d8c09c4c/hivemind/moe/client/expert.py#L221
														
 
															-    but without RemoteExpertWorker.run_coroutine() call that leads to deadlock here.
														
 
															-    """
														
 
															-
														
 
															-    grad_outputs_cpu = tuple(tensor.cpu() for tensor in grad_outputs)
														
 
															-    inputs_and_grad_outputs = tuple(nested_flatten((inputs, grad_outputs_cpu, *extra_tensors)))
														
 
															-
														
 
															-    # Modify forward_schema to support prompts
														
 
															-    args_schema, kwargs_schema = rpc_info["forward_schema"]
														
 
															-    assert len(args_schema) == 1 and isinstance(inputs, torch.Tensor)
														
 
															-    # TODO generalize this
														
 
															-    prompts_schema = next(iter(args_schema))
														
 
															-    backward_schema = tuple(nested_flatten((rpc_info["forward_schema"], rpc_info["outputs_schema"], prompts_schema)))
														
 
															-
														
 
															-    # Asynchronous serialization
														
 
															-    loop = asyncio.get_running_loop()
														
 
															-    serialized_tensors = await asyncio.gather(
														
 
															-        *(
														
 
															-            loop.run_in_executor(None, serialize_torch_tensor, tensor.to(proto.dtype), proto.compression)
														
 
															-            for tensor, proto in zip(inputs_and_grad_outputs, backward_schema)
														
 
															-        )
														
 
															-    )
														
 
															-
														
 
															-    deserialized_grad_inputs = await expert_backward(uid, inputs_and_grad_outputs, serialized_tensors, stub)
														
 
															-    return deserialized_grad_inputs
														
 
															-
														
 
															-
														
 
															 async def sequential_forward(
														
 
															     inputs: torch.Tensor,
														
 
															     prompts: torch.Tensor,
														
@@ -121,16 +41,17 @@ async def sequential_forward(
 
															     sequences = sequence_manager.make_sequence(start_index, end_index)
														
 
															     intermediate_inputs = []
														
 
															     done_sequences = []
														
 
															+    outputs = inputs
														
 
															     while len(sequences) > 0:
														
 
															         while True:
														
 
															+            span = sequences.pop(0)
														
 
															+            span_uids: str = CHAIN_DELIMITER.join(sequence_manager.block_uids[span.start : span.end])
														
 
															             try:
														
 
															-                span = sequences.pop(0)
														
 
															-                span_uids: str = CHAIN_DELIMITER.join(sequence_manager.block_uids[span.start : span.end])
														
 
															                 stub = TransformerConnectionHandler.get_stub(sequence_manager.p2p, span.peer_id)
														
 
															                 inputs_and_prompts = [inputs, prompts[span.start : span.end]]
														
 
															-                (outputs,) = await run_expert_forward(span_uids, stub, sequence_manager.rpc_info, *inputs_and_prompts)
														
 
															+                (outputs,) = await run_remote_forward(span_uids, stub, sequence_manager.rpc_info, *inputs_and_prompts)
														
 
															                 assert isinstance(outputs, torch.Tensor)
														
 
															                 assert outputs.shape == inputs.shape, f"Expected output {inputs.shape}, got {outputs.shape}"
														
@@ -171,7 +92,7 @@ async def sequential_backward(
 
															             span_uids: str = CHAIN_DELIMITER.join(sequence_manager.block_uids[span.start : span.end])
														
 
															             try:
														
 
															                 stub = TransformerConnectionHandler.get_stub(sequence_manager.p2p, span.peer_id)
														
 
															-                grad_outputs, *span_grad_prompts = await run_expert_backward(
														
 
															+                grad_outputs, *span_grad_prompts = await run_remote_backward(
														
 
															                     span_uids, stub, sequence_manager.rpc_info, inputs, grad_outputs, prompts[span.start : span.end]
														
 
															                 )
														
 
															                 grad_outputs = [grad_outputs]
														
--- a/src/client/spending_policy.py
+++ b/src/client/spending_policy.py
@@ -0,0 +1,14 @@
 
															+from abc import ABC, abstractmethod
														
 
															+
														
 
															+from hivemind.proto.runtime_pb2 import ExpertRequest
														
 
															+
														
 
															+
														
 
															+class SpendingPolicyBase(ABC):
														
 
															+    @abstractmethod
														
 
															+    def get_points(self, request: ExpertRequest, method_name: str, *args, **kwargs) -> float:
														
 
															+        pass
														
 
															+
														
 
															+
														
 
															+class NoSpendingPolicy(SpendingPolicyBase):
														
 
															+    def get_points(self, request: ExpertRequest, method_name: str, *args, **kwargs) -> float:
														
 
															+        return 0.0
														
--- a/src/server/backend.py
+++ b/src/server/backend.py
@@ -1,45 +1,20 @@
 
															 """Code for serving bloom blocks via hivemind-server"""
														
 
															-from queue import Empty
														
 
															 from typing import Any, Dict, Optional, Sequence, Tuple
														
 
															 import torch
														
 
															 from hivemind import BatchTensorDescriptor, use_hivemind_log_handler
														
 
															 from hivemind.moe.server.module_backend import ModuleBackend
														
 
															-from hivemind.moe.server.task_pool import TaskPool
														
 
															-from hivemind.utils import InvalidStateError, get_logger
														
 
															+from hivemind.utils import get_logger
														
 
															 from src.bloom.from_pretrained import BloomBlock
														
 
															 from src.server.cache import MemoryCache
														
 
															+from src.server.task_pool import PrioritizedTaskPool
														
 
															 from src.utils.misc import is_dummy
														
 
															 use_hivemind_log_handler("in_root_logger")
														
 
															 logger = get_logger(__file__)
														
 
															-class InferenceTaskPool(TaskPool):
														
 
															-    def __init__(self, *args, **kwargs):
														
 
															-        super().__init__(*args, **kwargs)
														
 
															-
														
 
															-        assert self.min_batch_size == 1, "min_batch_size in InferenceTaskPool cannot be greater 1"
														
 
															-
														
 
															-    def iterate_minibatches(self, *args, **kwargs):
														
 
															-        """Form minibatches by grouping one or more tasks together up to self.max_batch_size"""
														
 
															-
														
 
															-        while True:
														
 
															-            try:
														
 
															-                logger.debug(f"{self.name} getting next task")
														
 
															-                task = self.tasks.get(timeout=self.timeout)
														
 
															-            except Empty:
														
 
															-                logger.warning(f"Timeout reached but batch doesn't contain >={self.min_batch_size} elements yet")
														
 
															-                continue
														
 
															-
														
 
															-            try:
														
 
															-                if task.future.set_running_or_notify_cancel():
														
 
															-                    yield [task]
														
 
															-            except InvalidStateError as e:
														
 
															-                logger.debug(f"Failed to add task to batch: {task.future} raised {e}")
														
 
															-
														
 
															-
														
 
															 class TransformerBackend(ModuleBackend):
														
 
															     """A wrapper for BloomBlock that can process requests for bloom layer forward, forward_incremental, and backward"""
														
@@ -52,8 +27,15 @@ class TransformerBackend(ModuleBackend):
 
															         for name, buf in self.module.named_buffers():
														
 
															             assert not buf.requires_grad, f"Bloom layer parameters must not accumulate gradients, but {name} does"
														
 
															-        self.inference_pool = InferenceTaskPool(
														
 
															-            self.inference_step, max_batch_size=self.forward_pool.max_batch_size, name=f"{self.name}_inference"
														
 
															+        max_batch_size = self.forward_pool.max_batch_size
														
 
															+        self.inference_pool = PrioritizedTaskPool(
														
 
															+            self.inference_step, max_batch_size=max_batch_size, name=f"{self.name}_inference"
														
 
															+        )
														
 
															+        self.forward_pool = PrioritizedTaskPool(
														
 
															+            self.forward, max_batch_size=max_batch_size, name=f"{self.name}_forward"
														
 
															+        )
														
 
															+        self.backward_pool = PrioritizedTaskPool(
														
 
															+            self.backward, max_batch_size=max_batch_size, name=f"{self.name}_backward"
														
 
															         )
														
 
															         self.dtype = backend_dtype if backend_dtype else self.module.input_layernorm.weight.dtype
														
 
															         self.inference_schema = (
														
@@ -94,9 +76,9 @@ class TransformerBackend(ModuleBackend):
 
															                 cache[1, :, prefix_length:new_length, :] = new_v[:, prefix_length:new_length]
														
 
															                 return (hidden_states,)
														
 
															-    def get_pools(self) -> Sequence[TaskPool]:
														
 
															+    def get_pools(self) -> Sequence[PrioritizedTaskPool]:
														
 
															         return self.forward_pool, self.backward_pool, self.inference_pool
														
 
															     def get_info(self) -> Dict[str, Any]:
														
 
															-        """Get expert parameters and stats. Used by RemoteExpert to check shapes and for DMoE orchestration."""
														
 
															+        """Get module parameters and stats. Used by RemoteExpert to check shapes and for DMoE orchestration."""
														
 
															         return dict(super().get_info(), inference_schema=self.inference_schema)
														
--- a/src/server/handler.py
+++ b/src/server/handler.py
@@ -1,5 +1,5 @@
 
															 import contextlib
														
 
															-from typing import AsyncIterator, Dict, List, Optional, Sequence, Union
														
 
															+from typing import AsyncIterator, Dict, Iterable, List, Sequence, Tuple, Union
														
 
															 import torch
														
 
															 from hivemind import (
														
@@ -7,6 +7,7 @@ from hivemind import (
 
															     MSGPackSerializer,
														
 
															     P2PContext,
														
 
															     TensorDescriptor,
														
 
															+    deserialize_tensor_stream,
														
 
															     deserialize_torch_tensor,
														
 
															     nested_flatten,
														
 
															     serialize_torch_tensor,
														
@@ -14,12 +15,13 @@ from hivemind import (
 
															 from hivemind.moe.server.connection_handler import ConnectionHandler
														
 
															 from hivemind.p2p.p2p_daemon import DEFAULT_MAX_MSG_SIZE
														
 
															 from hivemind.proto import runtime_pb2
														
 
															-from hivemind.utils import as_aiter
														
 
															-from hivemind.utils.asyncio import anext
														
 
															+from hivemind.utils.asyncio import amap_in_executor, anext, as_aiter
														
 
															 from hivemind.utils.streaming import split_for_streaming
														
 
															 from src.data_structures import CHAIN_DELIMITER, ModuleUID
														
 
															 from src.server.backend import TransformerBackend
														
 
															+from src.server.task_pool import PrioritizedTaskPool
														
 
															+from src.server.task_prioritizer import DummyTaskPrioritizer, TaskPrioritizerBase
														
 
															 from src.utils.misc import DUMMY, is_dummy
														
@@ -28,11 +30,41 @@ class TransformerConnectionHandler(ConnectionHandler):
 
															     module_backends: Dict[ModuleUID, TransformerBackend]
														
 
															-    def __init__(self, dht: DHT, module_backends: Dict[str, TransformerBackend], inference_max_length: int):
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        dht: DHT,
														
 
															+        module_backends: Dict[str, TransformerBackend],
														
 
															+        inference_max_length: int,
														
 
															+        task_prioritizer: TaskPrioritizerBase = DummyTaskPrioritizer(),
														
 
															+    ):
														
 
															         super().__init__(dht, module_backends)
														
 
															         for module_backend in self.module_backends.values():
														
 
															             assert isinstance(module_backend, TransformerBackend)
														
 
															         self.inference_max_length = inference_max_length
														
 
															+        self._prioritizer = task_prioritizer
														
 
															+
														
 
															+    async def _gather_inputs(
														
 
															+        self, requests: AsyncIterator[runtime_pb2.ExpertRequest], context: P2PContext
														
 
															+    ) -> Tuple[str, List[torch.Tensor], Dict]:
														
 
															+        block_uid, metadata = None, None
														
 
															+
														
 
															+        def _unpack(req: runtime_pb2.ExpertRequest) -> Iterable[runtime_pb2.Tensor]:
														
 
															+            nonlocal block_uid, metadata
														
 
															+
														
 
															+            if block_uid is None:
														
 
															+                block_uid = req.uid
														
 
															+            elif block_uid != req.uid:
														
 
															+                raise ValueError("Block uids differ in one request")
														
 
															+
														
 
															+            if metadata is None:
														
 
															+                metadata = MSGPackSerializer.loads(req.metadata) if req.metadata else {}
														
 
															+
														
 
															+            return req.tensors
														
 
															+
														
 
															+        tensors_stream = amap_in_executor(_unpack, requests)
														
 
															+        inputs = await deserialize_tensor_stream(tensors_stream)
														
 
															+        assert isinstance(block_uid, str) and isinstance(metadata, dict)
														
 
															+        return block_uid, inputs, metadata
														
 
															     async def rpc_inference(
														
 
															         self,
														
@@ -47,13 +79,18 @@ class TransformerConnectionHandler(ConnectionHandler):
 
															             metadata = MSGPackSerializer.loads(request.metadata) if request.metadata else {}
														
 
															             requested_backends = tuple(self.module_backends[uid] for uid in requested_uids)
														
 
															             max_length = metadata.get("max_length")
														
 
															+            points = metadata.get("points", 0)
														
 
															             if not requested_uids:
														
 
															                 raise ValueError("User must specify at least one block for inference, but got none")
														
 
															             assert isinstance(max_length, int), f"rpc_inference metadata must contain int max_length, got {max_length}"
														
 
															+            assert isinstance(
														
 
															+                points, (float, int)
														
 
															+            ), f"rpc_inference should have number of points as a number or None, got {points}"
														
 
															             if not 0 <= max_length <= self.inference_max_length:
														
 
															                 raise ValueError(f"Cannot allocate KV cache for {max_length} tokens, max = {self.inference_max_length}")
														
 
															+            point_per_piece = points / max_length if max_length > 0 else 0.0
														
 
															             batch_size = request.tensors[0].size[0] if request.tensors else 1
														
 
															             cache_metadata = torch.tensor(
														
@@ -98,8 +135,19 @@ class TransformerConnectionHandler(ConnectionHandler):
 
															                         assert (
														
 
															                             hidden_states.ndim == 3
														
 
															                         ), f"inputs to {type(backend)} must be a list with a single 3d tensor of hidden states"
														
 
															+                        assert isinstance(
														
 
															+                            backend.inference_pool, PrioritizedTaskPool
														
 
															+                        ), "petals support only prioritized pools"
														
 
															+                        priority = self._prioritizer.prioritize(
														
 
															+                            cache_metadata,
														
 
															+                            hidden_states,
														
 
															+                            hypo_ids,
														
 
															+                            points=point_per_piece / len(requested_backends),
														
 
															+                            backend=backend,
														
 
															+                            type="inference",
														
 
															+                        )
														
 
															                         (hidden_states,) = await backend.inference_pool.submit_task(
														
 
															-                            cache_metadata, hidden_states, hypo_ids
														
 
															+                            cache_metadata, hidden_states, hypo_ids, priority=priority
														
 
															                         )
														
 
															                     # serialize and send last layer outputs
														
@@ -123,8 +171,15 @@ class TransformerConnectionHandler(ConnectionHandler):
 
															         flat_inputs = [deserialize_torch_tensor(tensor) for tensor in request.tensors]
														
 
															         requested_uids = self._check_uids(request.uid)
														
 
															         requested_backends = tuple(self.module_backends[uid] for uid in requested_uids)
														
 
															-
														
 
															-        hidden_states = await _rpc_forward(*flat_inputs, requested_backends=requested_backends)
														
 
															+        metadata = MSGPackSerializer.loads(request.metadata) if request.metadata else {}
														
 
															+        points = metadata.get("points", 0)
														
 
															+        assert isinstance(
														
 
															+            points, (float, int)
														
 
															+        ), f"rpc_forward should have number of points as number or None, got {points}"
														
 
															+
														
 
															+        hidden_states = await _rpc_forward(
														
 
															+            *flat_inputs, requested_backends=requested_backends, prioritizer=self._prioritizer, points=points
														
 
															+        )
														
 
															         assert isinstance(hidden_states, torch.Tensor) and hidden_states.ndim == 3
														
 
															         # Serialize output and respond to client
														
@@ -139,11 +194,17 @@ class TransformerConnectionHandler(ConnectionHandler):
 
															         self, requests: AsyncIterator[runtime_pb2.ExpertRequest], context: P2PContext
														
 
															     ) -> AsyncIterator[runtime_pb2.ExpertRequest]:
														
 
															         # Parse requests and prepare backends
														
 
															-        uid_str, flat_inputs = await self._gather_inputs(requests, context)
														
 
															+        uid_str, flat_inputs, metadata = await self._gather_inputs(requests, context)
														
 
															         requested_uids = self._check_uids(uid_str)
														
 
															         requested_backends = tuple(self.module_backends[uid] for uid in requested_uids)
														
 
															+        points = metadata.get("points", 0)
														
 
															+        assert isinstance(
														
 
															+            points, (float, int)
														
 
															+        ), f"rpc_forward_stream should have number of points as number or None, got {points}"
														
 
															-        hidden_states = await _rpc_forward(*flat_inputs, requested_backends=requested_backends)
														
 
															+        hidden_states = await _rpc_forward(
														
 
															+            *flat_inputs, requested_backends=requested_backends, prioritizer=self._prioritizer, points=points
														
 
															+        )
														
 
															         assert isinstance(hidden_states, torch.Tensor) and hidden_states.ndim == 3, "hidden_states must be a 3d tensor"
														
 
															         # Serialize the overall output
														
@@ -164,8 +225,15 @@ class TransformerConnectionHandler(ConnectionHandler):
 
															         flat_tensors = [deserialize_torch_tensor(tensor) for tensor in request.tensors]
														
 
															         requested_uids = self._check_uids(request.uid)
														
 
															         requested_backends = tuple(self.module_backends[uid] for uid in requested_uids)
														
 
															-
														
 
															-        grads = await _rpc_backward(*flat_tensors, requested_backends=requested_backends)
														
 
															+        metadata = MSGPackSerializer.loads(request.metadata) if request.metadata else {}
														
 
															+        points = metadata.get("points", 0)
														
 
															+        assert isinstance(
														
 
															+            points, (float, int)
														
 
															+        ), f"rpc_backward should have number of points as number or None, got {points}"
														
 
															+
														
 
															+        grads = await _rpc_backward(
														
 
															+            *flat_tensors, requested_backends=requested_backends, prioritizer=self._prioritizer, points=points
														
 
															+        )
														
 
															         # Modify grad_inputs_schema to support grad_prompts
														
 
															         assert len(requested_backends[0].args_schema) == 1 and len(grads) in (1, 2)  # TODO generalize
														
@@ -187,11 +255,17 @@ class TransformerConnectionHandler(ConnectionHandler):
 
															         self, requests: AsyncIterator[runtime_pb2.ExpertRequest], context: P2PContext
														
 
															     ) -> AsyncIterator[runtime_pb2.ExpertResponse]:
														
 
															-        uids_header, flat_tensors = await self._gather_inputs(requests, context)
														
 
															+        uids_header, flat_tensors, metadata = await self._gather_inputs(requests, context)
														
 
															         requested_uids = self._check_uids(uids_header)
														
 
															         requested_backends = tuple(self.module_backends[uid] for uid in requested_uids)
														
 
															+        points = metadata.get("points", 0)
														
 
															+        assert isinstance(
														
 
															+            points, (float, int)
														
 
															+        ), f"rpc_backward_stream should have number of points as number or None, got {points}"
														
 
															-        grads = await _rpc_backward(*flat_tensors, requested_backends=requested_backends)
														
 
															+        grads = await _rpc_backward(
														
 
															+            *flat_tensors, requested_backends=requested_backends, prioritizer=self._prioritizer, points=points
														
 
															+        )
														
 
															         # Modify grad_inputs_schema to support grad_prompts
														
 
															         assert len(requested_backends[0].args_schema) == 1 and len(grads) in (1, 2)  # TODO generalize
														
@@ -244,7 +318,12 @@ class TransformerConnectionHandler(ConnectionHandler):
 
															             yield handles
														
 
															-async def _rpc_forward(*flat_tensors: torch.Tensor, requested_backends: Sequence[TransformerBackend]) -> torch.Tensor:
														
 
															+async def _rpc_forward(
														
 
															+    *flat_tensors: torch.Tensor,
														
 
															+    requested_backends: Sequence[TransformerBackend],
														
 
															+    prioritizer: TaskPrioritizerBase,
														
 
															+    points: int = 0,
														
 
															+) -> torch.Tensor:
														
 
															     """
														
 
															     Run forward pass on deserialized inputs and prompts, used by rpc_forward and rpc_forward_stream
														
@@ -267,7 +346,15 @@ async def _rpc_forward(*flat_tensors: torch.Tensor, requested_backends: Sequence
 
															     for backend, prompt in zip(requested_backends, prompts):
														
 
															         if not is_dummy(prompt):
														
 
															             hidden_states[:, : prompt.shape[1]] += prompt
														
 
															-        (hidden_states,) = await backend.forward_pool.submit_task(hidden_states)
														
 
															+
														
 
															+        assert isinstance(backend.inference_pool, PrioritizedTaskPool), "petals support only prioritized pools"
														
 
															+        priority = prioritizer.prioritize(
														
 
															+            hidden_states, points=points / len(requested_backends), backend=backend, type="forward"
														
 
															+        )
														
 
															+        (hidden_states,) = await backend.forward_pool.submit_task(
														
 
															+            hidden_states,
														
 
															+            priority=priority,
														
 
															+        )
														
 
															         assert isinstance(hidden_states, torch.Tensor)
														
 
															         assert (
														
 
															             hidden_states.ndim == 3
														
@@ -278,7 +365,10 @@ async def _rpc_forward(*flat_tensors: torch.Tensor, requested_backends: Sequence
 
															 async def _rpc_backward(
														
 
															-    *flat_tensors: torch.Tensor, requested_backends: Sequence[TransformerBackend]
														
 
															+    *flat_tensors: torch.Tensor,
														
 
															+    requested_backends: Sequence[TransformerBackend],
														
 
															+    prioritizer: TaskPrioritizerBase,
														
 
															+    points: int = 0,
														
 
															 ) -> Union[torch.Tensor, Sequence[torch.Tensor]]:
														
 
															     inputs, grad_outputs, prompts = flat_tensors
														
 
															     # Cast inputs & grad outputs to backend dtype
														
@@ -298,7 +388,12 @@ async def _rpc_backward(
 
															         if not is_dummy(prompt):
														
 
															             inputs[:, : prompt.shape[1]] += prompt
														
 
															         inter_inputs.append(inputs)
														
 
															-        (inputs,) = await backend.forward_pool.submit_task(inputs)
														
 
															+        assert isinstance(backend.inference_pool, PrioritizedTaskPool), "petals support only prioritized pools"
														
 
															+        priority = prioritizer.prioritize(
														
 
															+            inputs, points=points / len(requested_backends), backend=backend, type="forward_in_backward"
														
 
															+        )
														
 
															+        (inputs,) = await backend.forward_pool.submit_task(inputs, priority=priority)
														
 
															+
														
 
															         assert isinstance(inputs, torch.Tensor)
														
 
															     if not is_dummy(prompts[-1]):
														
@@ -309,7 +404,12 @@ async def _rpc_backward(
 
															     grad_prompts_reversed = []
														
 
															     # Run a chain of requested backends
														
 
															     for inp, prompt, backend in zip(*map(reversed, (inter_inputs, prompts, requested_backends))):
														
 
															-        (grad_outputs,) = await backend.backward_pool.submit_task(inp, grad_outputs)
														
 
															+        assert isinstance(backend.inference_pool, PrioritizedTaskPool), "petals support only prioritized pools"
														
 
															+        priority = prioritizer.prioritize(
														
 
															+            inp, grad_outputs, points=points / len(requested_backends), backend=backend, type="backward"
														
 
															+        )
														
 
															+        (grad_outputs,) = await backend.backward_pool.submit_task(inp, grad_outputs, priority=priority)
														
 
															+
														
 
															         assert isinstance(grad_outputs, torch.Tensor)
														
 
															         if not is_dummy(prompt):
														
 
															             grad_prompts_reversed.append(grad_outputs[:, : prompt.shape[1]].unsqueeze(0))
														
--- a/src/server/runtime.py
+++ b/src/server/runtime.py
@@ -0,0 +1,198 @@
 
															+import multiprocessing as mp
														
 
															+import multiprocessing.pool
														
 
															+import threading
														
 
															+from collections import defaultdict
														
 
															+from itertools import chain
														
 
															+from queue import SimpleQueue
														
 
															+from selectors import EVENT_READ, DefaultSelector
														
 
															+from statistics import mean
														
 
															+from time import time
														
 
															+from typing import Dict, NamedTuple, Optional
														
 
															+
														
 
															+import torch
														
 
															+from hivemind.moe.server.module_backend import ModuleBackend
														
 
															+from hivemind.utils import get_logger
														
 
															+from prefetch_generator import BackgroundGenerator
														
 
															+
														
 
															+logger = get_logger(__name__)
														
 
															+
														
 
															+
														
 
															+class Runtime(threading.Thread):
														
 
															+    """
														
 
															+    A group of processes that processes incoming requests for multiple module backends on a shared device.
														
 
															+    Runtime is usually created and managed by Server, humans need not apply.
														
 
															+
														
 
															+    For debugging, you can start runtime manually with .start() or .run()
														
 
															+
														
 
															+    >>> module_backends = {'block_uid': ModuleBackend(**kwargs)}
														
 
															+    >>> runtime = Runtime(module_backends)
														
 
															+    >>> runtime.start()  # start runtime in background thread. To start in current thread, use runtime.run()
														
 
															+    >>> runtime.ready.wait()  # await for runtime to load all blocks on device and create request pools
														
 
															+    >>> future = runtime.module_backends['block_uid'].forward_pool.submit_task(*module_inputs)
														
 
															+    >>> print("Returned:", future.result())
														
 
															+    >>> runtime.shutdown()
														
 
															+
														
 
															+    :param module_backends: a dict [block uid -> ModuleBackend]
														
 
															+    :param prefetch_batches: form up to this many batches in advance
														
 
															+    :param sender_threads: dispatches outputs from finished batches using this many asynchronous threads
														
 
															+    :param device: if specified, moves all blocks and data to this device via .to(device=device).
														
 
															+      If you want to manually specify devices for each block (in their forward pass), leave device=None (default)
														
 
															+
														
 
															+    :param stats_report_interval: interval to collect and log statistics about runtime performance
														
 
															+    """
														
 
															+
														
 
															+    SHUTDOWN_TRIGGER = "RUNTIME SHUTDOWN TRIGGERED"
														
 
															+
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        module_backends: Dict[str, ModuleBackend],
														
 
															+        prefetch_batches: int = 1,
														
 
															+        sender_threads: int = 1,
														
 
															+        device: torch.device = None,
														
 
															+        stats_report_interval: Optional[int] = None,
														
 
															+    ):
														
 
															+        super().__init__()
														
 
															+        self.module_backends = module_backends
														
 
															+        self.pools = tuple(chain(*(backend.get_pools() for backend in module_backends.values())))
														
 
															+        self.device, self.prefetch_batches, self.sender_threads = device, prefetch_batches, sender_threads
														
 
															+        self.shutdown_recv, self.shutdown_send = mp.Pipe(duplex=False)
														
 
															+        self.shutdown_trigger = mp.Event()
														
 
															+        self.ready = mp.Event()  # event is set iff server is currently running and ready to accept batches
														
 
															+
														
 
															+        self.stats_report_interval = stats_report_interval
														
 
															+        if self.stats_report_interval is not None:
														
 
															+            self.stats_reporter = StatsReporter(self.stats_report_interval)
														
 
															+
														
 
															+    def run(self):
														
 
															+        for pool in self.pools:
														
 
															+            if not pool.is_alive():
														
 
															+                pool.start()
														
 
															+        if self.device is not None:
														
 
															+            for backend in self.module_backends.values():
														
 
															+                backend.module.to(self.device)
														
 
															+
														
 
															+        with mp.pool.ThreadPool(self.sender_threads) as output_sender_pool:
														
 
															+            try:
														
 
															+                self.ready.set()
														
 
															+                if self.stats_report_interval is not None:
														
 
															+                    self.stats_reporter.start()
														
 
															+                logger.info("Started")
														
 
															+
														
 
															+                batch_iterator = self.iterate_minibatches_from_pools()
														
 
															+                if self.prefetch_batches > 0:
														
 
															+                    batch_iterator = BackgroundGenerator(batch_iterator, self.prefetch_batches)
														
 
															+
														
 
															+                for pool, batch_index, batch in batch_iterator:
														
 
															+                    logger.debug(f"Processing batch {batch_index} from pool {pool.name}")
														
 
															+
														
 
															+                    start = time()
														
 
															+                    try:
														
 
															+                        outputs = pool.process_func(*batch)
														
 
															+                        output_sender_pool.apply_async(pool.send_outputs_from_runtime, args=[batch_index, outputs])
														
 
															+
														
 
															+                        batch_processing_time = time() - start
														
 
															+
														
 
															+                        batch_size = outputs[0].size(0)
														
 
															+                        logger.debug(f"Pool {pool.name}: batch {batch_index} processed, size {batch_size}")
														
 
															+
														
 
															+                        if self.stats_report_interval is not None:
														
 
															+                            self.stats_reporter.report_stats(pool.name, batch_size, batch_processing_time)
														
 
															+
														
 
															+                    except KeyboardInterrupt:
														
 
															+                        raise
														
 
															+                    except BaseException as exception:
														
 
															+                        logger.exception(f"Caught {exception}, attempting to recover")
														
 
															+                        output_sender_pool.apply_async(pool.send_exception_from_runtime, args=[batch_index, exception])
														
 
															+
														
 
															+            finally:
														
 
															+                if not self.shutdown_trigger.is_set():
														
 
															+                    self.shutdown()
														
 
															+
														
 
															+    def shutdown(self):
														
 
															+        """Gracefully terminate a running runtime."""
														
 
															+        logger.info("Shutting down")
														
 
															+        self.ready.clear()
														
 
															+
														
 
															+        if self.stats_report_interval is not None:
														
 
															+            self.stats_reporter.stop.set()
														
 
															+            self.stats_reporter.join()
														
 
															+
														
 
															+        logger.debug("Terminating pools")
														
 
															+        for pool in self.pools:
														
 
															+            if pool.is_alive():
														
 
															+                pool.shutdown()
														
 
															+        logger.debug("Pools terminated")
														
 
															+
														
 
															+        # trigger background thread to shutdown
														
 
															+        self.shutdown_send.send(self.SHUTDOWN_TRIGGER)
														
 
															+        self.shutdown_trigger.set()
														
 
															+
														
 
															+    def iterate_minibatches_from_pools(self, timeout=None):
														
 
															+        """
														
 
															+        Chooses pool according to priority, then copies exposed batch and frees the buffer
														
 
															+        """
														
 
															+        with DefaultSelector() as selector:
														
 
															+            for pool in self.pools:
														
 
															+                selector.register(pool.batch_receiver, EVENT_READ, pool)
														
 
															+            selector.register(self.shutdown_recv, EVENT_READ, self.SHUTDOWN_TRIGGER)
														
 
															+
														
 
															+            while True:
														
 
															+                # wait until at least one batch_receiver becomes available
														
 
															+                logger.debug("Waiting for inputs from task pools")
														
 
															+                ready_fds = selector.select()
														
 
															+                ready_objects = {key.data for (key, events) in ready_fds}
														
 
															+                if self.SHUTDOWN_TRIGGER in ready_objects:
														
 
															+                    break  # someone asked us to shutdown, break from the loop
														
 
															+
														
 
															+                logger.debug("Choosing the pool with first priority")
														
 
															+
														
 
															+                pool = min(ready_objects, key=lambda pool: pool.priority)
														
 
															+
														
 
															+                logger.debug(f"Loading batch from {pool.name}")
														
 
															+                batch_index, batch_tensors = pool.load_batch_to_runtime(timeout, self.device)
														
 
															+                logger.debug(f"Loaded batch from {pool.name}")
														
 
															+                yield pool, batch_index, batch_tensors
														
 
															+
														
 
															+
														
 
															+BatchStats = NamedTuple("BatchStats", (("batch_size", int), ("processing_time", float)))
														
 
															+
														
 
															+
														
 
															+class StatsReporter(threading.Thread):
														
 
															+    def __init__(self, report_interval: int):
														
 
															+        super().__init__()
														
 
															+        self.report_interval = report_interval
														
 
															+        self.stop = threading.Event()
														
 
															+        self.stats_queue = SimpleQueue()
														
 
															+
														
 
															+    def run(self):
														
 
															+        while not self.stop.wait(self.report_interval):
														
 
															+            pool_batch_stats = defaultdict(list)
														
 
															+            while not self.stats_queue.empty():
														
 
															+                pool_uid, batch_stats = self.stats_queue.get()
														
 
															+                pool_batch_stats[pool_uid].append(batch_stats)
														
 
															+
														
 
															+            total_processed_batches = sum(len(pool_stats) for pool_stats in pool_batch_stats.values())
														
 
															+            logger.info(f"Processed {total_processed_batches} batches in last {self.report_interval} seconds:")
														
 
															+            for pool_uid, pool_stats in pool_batch_stats.items():
														
 
															+                total_batches = len(pool_stats)
														
 
															+                total_examples = sum(batch_stats.batch_size for batch_stats in pool_stats)
														
 
															+                avg_batch_size = mean(batch_stats.batch_size for batch_stats in pool_stats)
														
 
															+                total_time = sum(batch_stats.processing_time for batch_stats in pool_stats)
														
 
															+                batches_to_time = total_batches / total_time
														
 
															+                batch_performance = f"{batches_to_time:.2f} " + ("batches/s" if batches_to_time > 1 else "s/batch")
														
 
															+
														
 
															+                examples_to_time = total_examples / total_time
														
 
															+                example_performance = f"{examples_to_time:.2f} " + (
														
 
															+                    "examples/s" if examples_to_time > 1 else "s/example"
														
 
															+                )
														
 
															+
														
 
															+                logger.info(
														
 
															+                    f"{pool_uid}: "
														
 
															+                    f"{total_batches} batches ({batch_performance}), "
														
 
															+                    f"{total_examples} examples ({example_performance}), "
														
 
															+                    f"avg batch size {avg_batch_size:.2f}"
														
 
															+                )
														
 
															+
														
 
															+    def report_stats(self, pool_uid, batch_size, processing_time):
														
 
															+        batch_stats = BatchStats(batch_size, processing_time)
														
 
															+        self.stats_queue.put_nowait((pool_uid, batch_stats))
														
--- a/src/server/server.py
+++ b/src/server/server.py
@@ -71,9 +71,9 @@ class Server(threading.Thread):
 
															         runs Runtime (self.runtime) to process incoming requests.
														
 
															         """
														
 
															         logger.info(f"Serving {len(self.module_backends)} blocks:")
														
 
															-        for expert_name, backend in self.module_backends.items():
														
 
															+        for block_name, backend in self.module_backends.items():
														
 
															             num_parameters = sum(p.numel() for p in backend.module.parameters() if p.requires_grad)
														
 
															-            logger.info(f"{expert_name}: {backend.module.__class__.__name__}, {num_parameters} parameters")
														
 
															+            logger.info(f"{block_name}: {backend.module.__class__.__name__}, {num_parameters} parameters")
														
 
															         if not self.dht.is_alive():
														
 
															             self.dht.run_in_background(await_ready=True)
														
@@ -118,6 +118,8 @@ class Server(threading.Thread):
 
															         custom_module_path=None,
														
 
															         update_period: float = 30,
														
 
															         expiration: Optional[float] = None,
														
 
															+        prefetch_batches: int = 1,
														
 
															+        sender_threads: int = 1,
														
 
															         max_block_selection_delay: float = 1,
														
 
															         use_auth_token: Optional[str] = None,
														
 
															         load_in_8bit: bool = False,
														
@@ -236,6 +238,8 @@ class Server(threading.Thread):
 
															             stats_report_interval=stats_report_interval,
														
 
															             update_period=update_period,
														
 
															             expiration=expiration,
														
 
															+            prefetch_batches=prefetch_batches,
														
 
															+            sender_threads=sender_threads,
														
 
															             start=start,
														
 
															         )
														
--- a/src/server/task_pool.py
+++ b/src/server/task_pool.py
@@ -0,0 +1,175 @@
 
															+import ctypes
														
 
															+import multiprocessing as mp
														
 
															+import threading
														
 
															+import time
														
 
															+from dataclasses import dataclass, field
														
 
															+from queue import PriorityQueue
														
 
															+from typing import Any, Generator, List, Optional, Sequence, Tuple
														
 
															+
														
 
															+import torch
														
 
															+from hivemind import MPFuture, get_logger, use_hivemind_log_handler
														
 
															+from hivemind.moe.server.task_pool import TaskPoolBase
														
 
															+
														
 
															+use_hivemind_log_handler("in_root_logger")
														
 
															+logger = get_logger(__file__)
														
 
															+
														
 
															+
														
 
															+@dataclass(order=True, frozen=True)
														
 
															+class Task:
														
 
															+    priority: float
														
 
															+    time_submitted: float
														
 
															+    future: MPFuture = field(compare=False)
														
 
															+    args: Sequence[torch.Tensor] = field(compare=False)
														
 
															+
														
 
															+    @property
														
 
															+    def uid(self) -> int:
														
 
															+        return self.future._uid
														
 
															+
														
 
															+
														
 
															+class PrioritizedTaskPool(TaskPoolBase):
														
 
															+    """
														
 
															+    Aggregates requests from multiple ConnectionHandler instances, orders them for processing in Runtime, then
														
 
															+    returns results (or exception) to the corresponding ConnectionHandler. Runs a background process.
														
 
															+    A single PrioritizedTaskPool services a specific function (e.g. layer1.forward, layer2.forward or layer1.backward)
														
 
															+
														
 
															+    :note: unlike hivemind.moe TaskPool, this pool does *not* combine incoming requests into batches.
														
 
															+      This would require grouping requests of different length.
														
 
															+
														
 
															+    :param process_func: function to be applied to every formed batch; called by Runtime
														
 
															+        Note that process_func should accept only positional args (Tensors) and return a flat tuple of Tensors
														
 
															+    :param max_batch_size: process at most this many inputs in a batch (task contains have one or several inputs)
														
 
															+         Measured in the total number of tokens (i.e. batch size * sequence length)
														
 
															+
														
 
															+    :param name: pool name, used for logging
														
 
															+    :param min_batch_size: process at least this many inputs in a batch, otherwise wait for more
														
 
															+    :param start: if True, start automatically at the end of __init__
														
 
															+    """
														
 
															+
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        process_func: callable,
														
 
															+        max_batch_size: int,
														
 
															+        name: str,
														
 
															+        min_batch_size=1,
														
 
															+        daemon=True,
														
 
															+        start=False,
														
 
															+    ):
														
 
															+        super().__init__(process_func, daemon=daemon, name=name)
														
 
															+        self.min_batch_size, self.max_batch_size = min_batch_size, max_batch_size
														
 
															+
														
 
															+        self.submitted_tasks = mp.SimpleQueue()  # interaction with ConnectionHandlers
														
 
															+        self._ordered_tasks = PriorityQueue()  # interaction with Runtime - only valid inside Runtime
														
 
															+
														
 
															+        self._prioritizer_thread = threading.Thread(
														
 
															+            name=self.name + "_prioritizer",
														
 
															+            target=self._prioritize_tasks,
														
 
															+            args=[self.submitted_tasks, self._ordered_tasks],
														
 
															+            daemon=True,
														
 
															+        )
														
 
															+        self._dispatched_tasks = {}
														
 
															+        self.batch_receiver, self.batch_sender = mp.Pipe(duplex=False)
														
 
															+        self._oldest_undispatched_timestamp = mp.Value(ctypes.c_double, 1.0)
														
 
															+        self.priority = float("inf"), float("inf")  # (first task priority, first task timestamp)
														
 
															+        if start:
														
 
															+            self.start()
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def _prioritize_tasks(submitted_tasks: mp.SimpleQueue, ordered_tasks: PriorityQueue):
														
 
															+        """Read tasks from incoming queue and put them into a local priority queue"""
														
 
															+        while True:
														
 
															+            task = submitted_tasks.get()
														
 
															+            if task is None:
														
 
															+                logger.debug("Shutting down prioritizer thread")
														
 
															+                break
														
 
															+
														
 
															+            ordered_tasks.put(task, block=True)
														
 
															+
														
 
															+    def start(self):
														
 
															+        assert not self.is_alive() and not self._prioritizer_thread.is_alive()
														
 
															+        self._prioritizer_thread.start()
														
 
															+        super().start()
														
 
															+
														
 
															+    def shutdown(self, timeout: Optional[float] = None):
														
 
															+        self.submitted_tasks.put(None)
														
 
															+        self.terminate()
														
 
															+        self._prioritizer_thread.join(timeout)
														
 
															+
														
 
															+    def submit_task(self, *args: torch.Tensor, priority: float = 0.0) -> MPFuture:
														
 
															+        """Add task to this pool's queue, return Future for its output"""
														
 
															+        task = Task(priority, time.monotonic(), MPFuture(), args)
														
 
															+        if self.get_task_size(task) > self.max_batch_size:
														
 
															+            exc = ValueError(f"Task size greater than max_batch_size ({self.max_batch_size}), it can't be processed")
														
 
															+            task.future.set_exception(exc)
														
 
															+        else:
														
 
															+            self.submitted_tasks.put(task)
														
 
															+            self.batch_sender.send(None)  # use this pipe to count the number of unfinished batches
														
 
															+            if (task.priority, task.time_submitted) < self.priority:
														
 
															+                self.priority = (task.priority, task.time_submitted)
														
 
															+        return task.future
														
 
															+
														
 
															+    def get_task_size(self, task: Task) -> int:
														
 
															+        """compute task processing complexity; defaults to the total number of tokens"""
														
 
															+        if task.args and task.args[0].ndim >= 2:
														
 
															+            return task.args[0].shape[0] * task.args[0].shape[1]
														
 
															+        return 1
														
 
															+
														
 
															+    def load_batch_to_runtime(
														
 
															+        self, timeout: Optional[float] = None, device: Optional[torch.device] = None
														
 
															+    ) -> Tuple[Any, List[torch.Tensor]]:
														
 
															+        """receive next batch of arrays"""
														
 
															+        task = self._ordered_tasks.get(block=True, timeout=timeout)
														
 
															+        batch_inputs = [
														
 
															+            tensor.detach().to(device, non_blocking=True).requires_grad_(tensor.requires_grad) for tensor in task.args
														
 
															+        ]
														
 
															+        self._dispatched_tasks[task.uid] = task
														
 
															+        self.batch_receiver.recv()  # reduce the number of active batches
														
 
															+        if not self._ordered_tasks.empty():
														
 
															+            first_remaining_task: Task = self._ordered_tasks.queue[0]
														
 
															+            self.priority = (first_remaining_task.priority, first_remaining_task.time_submitted)
														
 
															+        return task.uid, batch_inputs
														
 
															+
														
 
															+    def send_outputs_from_runtime(self, uid: int, batch_outputs: List[torch.Tensor]):
														
 
															+        """send results for a processed batch, previously loaded through load_batch_to_runtime"""
														
 
															+        batch_outputs = [
														
 
															+            tensor.to(device="cpu").share_memory_().detach().requires_grad_(tensor.requires_grad)
														
 
															+            for tensor in batch_outputs
														
 
															+        ]
														
 
															+
														
 
															+        task = self._dispatched_tasks.pop(uid, None)
														
 
															+        if task is None:
														
 
															+            logger.error(
														
 
															+                f"Internal error: task task with index {uid} is missing from the dictionary; " f"Could not set result"
														
 
															+            )
														
 
															+        else:
														
 
															+            task.future.set_result(batch_outputs)
														
 
															+
														
 
															+    def send_exception_from_runtime(self, uid: int, exception: BaseException):
														
 
															+        task = self._dispatched_tasks.pop(uid, None)
														
 
															+        if task is None:
														
 
															+            logger.error(
														
 
															+                f"Internal error: task task with index {uid} is missing from the dictionary; "
														
 
															+                f"Could not set exception {exception}"
														
 
															+            )
														
 
															+        else:
														
 
															+            task.future.set_exception(exception)
														
 
															+
														
 
															+    def run(self, *args, **kwargs):
														
 
															+        mp.Event().wait()
														
 
															+
														
 
															+    @property
														
 
															+    def empty(self):
														
 
															+        return not self.batch_receiver.poll()
														
 
															+
														
 
															+    @property
														
 
															+    def priority(self) -> Tuple[float, float]:
														
 
															+        """The priority of this pool equals the (priority, timestamp) of the most important task in it."""
														
 
															+        return float(self._priority.value), float(self._oldest_undispatched_timestamp.value)
														
 
															+
														
 
															+    @priority.setter
														
 
															+    def priority(self, item: Tuple[float, float]):
														
 
															+        assert len(item) == 2
														
 
															+        self._priority.value = float(item[0])
														
 
															+        self._oldest_undispatched_timestamp.value = float(item[1])
														
 
															+
														
 
															+    def iterate_minibatches(self, *args, **kwargs) -> Generator[List[Task], None, None]:
														
 
															+        raise NotImplementedError()
														
--- a/src/server/task_prioritizer.py
+++ b/src/server/task_prioritizer.py
@@ -0,0 +1,20 @@
 
															+from abc import ABC, abstractmethod
														
 
															+
														
 
															+import torch
														
 
															+from hivemind.moe.server.task_pool import Task
														
 
															+
														
 
															+
														
 
															+class TaskPrioritizerBase(ABC):
														
 
															+    """Abstract class for TaskPrioritizer whose reponsibility is to evaluate task priority"""
														
 
															+
														
 
															+    @abstractmethod
														
 
															+    def prioritize(self, *input: torch.Tensor, points: float = 0.0, **kwargs) -> float:
														
 
															+        """Evaluates task value by the amout of points given, task input and additional kwargs. Lower priority is better"""
														
 
															+        pass
														
 
															+
														
 
															+
														
 
															+class DummyTaskPrioritizer(TaskPrioritizerBase):
														
 
															+    """Simple implementation of TaskPrioritizer which gives constant zero priority for every task"""
														
 
															+
														
 
															+    def prioritize(self, *input: torch.Tensor, points: float = 0.0, **kwargs) -> float:
														
 
															+        return 0.0
														
--- a/tests/test_priority_pool.py
+++ b/tests/test_priority_pool.py
@@ -0,0 +1,71 @@
 
															+import multiprocessing as mp
														
 
															+import time
														
 
															+
														
 
															+import pytest
														
 
															+import torch
														
 
															+
														
 
															+from src.server.runtime import Runtime
														
 
															+from src.server.task_pool import PrioritizedTaskPool
														
 
															+
														
 
															+
														
 
															+@pytest.mark.forked
														
 
															+def test_priority_pools():
														
 
															+    outputs_queue = mp.SimpleQueue()
														
 
															+    results_valid = mp.Event()
														
 
															+
														
 
															+    def dummy_pool_func(x):
														
 
															+        time.sleep(0.1)
														
 
															+        y = x**2
														
 
															+        outputs_queue.put((x, y))
														
 
															+        return (y,)
														
 
															+
														
 
															+    class DummyBackend:
														
 
															+        def __init__(self, pools):
														
 
															+            self.pools = pools
														
 
															+
														
 
															+        def get_pools(self):
														
 
															+            return self.pools
														
 
															+
														
 
															+    pools = (
														
 
															+        PrioritizedTaskPool(dummy_pool_func, name="A", max_batch_size=1),
														
 
															+        PrioritizedTaskPool(dummy_pool_func, name="B", max_batch_size=1),
														
 
															+    )
														
 
															+
														
 
															+    runtime = Runtime({str(i): DummyBackend([pool]) for i, pool in enumerate(pools)}, prefetch_batches=0)
														
 
															+    runtime.start()
														
 
															+
														
 
															+    def process_tasks():
														
 
															+        futures = []
														
 
															+        futures.append(pools[0].submit_task(torch.tensor([0]), priority=1))
														
 
															+        futures.append(pools[0].submit_task(torch.tensor([1]), priority=1))
														
 
															+        time.sleep(0.01)
														
 
															+        futures.append(pools[1].submit_task(torch.tensor([2]), priority=1))
														
 
															+        futures.append(pools[0].submit_task(torch.tensor([3]), priority=2))
														
 
															+        futures.append(pools[0].submit_task(torch.tensor([4]), priority=10))
														
 
															+        futures.append(pools[0].submit_task(torch.tensor([5]), priority=0))
														
 
															+        futures.append(pools[0].submit_task(torch.tensor([6]), priority=1))
														
 
															+        futures.append(pools[1].submit_task(torch.tensor([7]), priority=11))
														
 
															+        futures.append(pools[1].submit_task(torch.tensor([8]), priority=1))
														
 
															+        for i, f in enumerate(futures):
														
 
															+            assert f.result()[0].item() == i**2
														
 
															+        results_valid.set()
														
 
															+
														
 
															+    proc = mp.Process(target=process_tasks)
														
 
															+    proc.start()
														
 
															+    proc.join()
														
 
															+    assert results_valid.is_set()
														
 
															+
														
 
															+    ordered_outputs = []
														
 
															+    while not outputs_queue.empty():
														
 
															+        ordered_outputs.append(outputs_queue.get()[0].item())
														
 
															+
														
 
															+    assert ordered_outputs == [0, 5, 1, 2, 6, 8, 3, 4, 7]
														
 
															+    #                          0 - first batch is loaded immediately, before everything else
														
 
															+    #                             5 - highest priority task overall
														
 
															+    #                                1 - first of several tasks with equal lowest priority (1)
														
 
															+    #                                   2 - second earliest task with priority 1, fetched from pool B
														
 
															+    #                                      6 - third earliest task with priority 1, fetched from pool A again
														
 
															+    #                                         8 - last priority-1 task, pool B
														
 
															+    #                                            3 - task with priority 2 from pool A
														
 
															+    #                                               4 - task with priority 10 from pool A
														
 
															+    #                                                  7 - task with priority 11 from pool B