@@ -1,21 +1,24 @@
-from typing import AsyncIterator, Dict
+import contextlib
+from typing import AsyncIterator, Dict, Sequence
 
 import torch
-from hivemind import DHT, P2PContext, TensorDescriptor, deserialize_torch_tensor, nested_flatten
+from hivemind import DHT, P2PContext, TensorDescriptor, deserialize_torch_tensor, nested_flatten, serialize_torch_tensor
 from hivemind.moe.server.connection_handler import ConnectionHandler
 from hivemind.proto import runtime_pb2
 from hivemind.utils.asyncio import anext
 
+from src.data_structures import CHAIN_DELIMITER, ModuleUID
 from src.server.backend import MAX_LENGTH, TransformerBackend
 
 
 class TransformerConnectionHandler(ConnectionHandler):
     """Handles three request types: forward, backward and forward-incremental (inference)"""
+    module_backends: Dict[ModuleUID, TransformerBackend]
 
     def __init__(self, dht: DHT, module_backends: Dict[str, TransformerBackend]):
-        for module_backend in module_backends.values():
-            assert isinstance(module_backend, TransformerBackend)
         super().__init__(dht, module_backends)
+        for module_backend in self.module_backends.values():
+            assert isinstance(module_backend, TransformerBackend)
 
     async def rpc_inference(
         self, requests: AsyncIterator[runtime_pb2.ExpertRequest], context: P2PContext
@@ -24,28 +27,69 @@ class TransformerConnectionHandler(ConnectionHandler):
         try:
             print("OPENED RPC_INFERENCE")
             request = await anext(requests)
-            if not request.uid:
-                raise RuntimeError("User did not provide any uids.")
-            backend = self.module_backends[request.uid]
-            assert isinstance(backend, TransformerBackend)
-
-            # prepare attention cache
-            num_heads = backend.module.self_attention.num_heads
-            head_dim = backend.module.self_attention.head_dim
+            requested_uids = self._check_header(request)
+            requested_backends = tuple(self.module_backends[uid] for uid in requested_uids)
+
             cache_metadata = torch.tensor([[-1, -1]], dtype=torch.int64)  # [cache_handle, prefix_length]
-            cache_descriptor = TensorDescriptor(size=(2, 1, MAX_LENGTH, num_heads, head_dim), dtype=torch.float32)
             prefix_length = 0
 
-            async with backend.memory_cache.allocate_cache(cache_descriptor) as cache_handle:
-                while request.uid or request.tensors:  # iterate while user is willing to supply tensors
-                    inputs = [cache_metadata, *(deserialize_torch_tensor(tensor) for tensor in request.tensors)]
-                    print("INPUTS:", inputs)
-                    assert len(inputs) == 2 and inputs[1].ndim == 3, "send only hidden states for now"
-                    cache_metadata[0, 0], cache_metadata[0, 1] = cache_handle, prefix_length
-                    outputs = await self._process_inputs(inputs, backend.inference_pool, backend.outputs_schema)
-                    yield runtime_pb2.ExpertResponse(tensors=outputs)
+            async with self._allocate_caches(requested_backends) as cache_handles:
+                assert len(cache_handles) == len(requested_backends)
+                while request.tensors:  # iterate while user is willing to supply tensors
+                    hidden_states = [deserialize_torch_tensor(tensor) for tensor in request.tensors]
+
+                    # run request tensors through all requested modules, update caches
+                    for backend, cache_handle in zip(requested_backends, cache_handles):
+                        cache_metadata[0, 0], cache_metadata[0, 1] = cache_handle, prefix_length
+                        assert len(hidden_states) == 1 and hidden_states[0].ndim == 3, \
+                            f"inputs to {type(backend)} must be a list with a single 3d tensor of hidden states"
+
+                        hidden_states = await backend.inference_pool.submit_task(cache_metadata, *hidden_states)
+                        assert isinstance(hidden_states, (list, tuple))
+                        assert len(hidden_states) == 1 and hidden_states[0].ndim == 3
-                    prefix_length += inputs[1].shape[1]
+
+                    # serialize and send last layer outputs
+                    yield runtime_pb2.ExpertResponse(tensors=[
+                        serialize_torch_tensor(result, proto.compression, allow_inplace=True)
+                        for result, proto in zip(hidden_states, nested_flatten(requested_backends[-1].outputs_schema))
+                    ])
+
+                    # prepare for next step
+                    prefix_length += hidden_states[0].shape[1]
                     request = await (anext(requests))
         finally:
             print("CLOSED RPC_INFERENCE")
+
+    def _check_header(self, request: runtime_pb2.ExpertRequest) -> Sequence[ModuleUID]:
+        """Check that the first request to rpc_inference is valid"""
+        uids = (request.uid or '').split(CHAIN_DELIMITER)
+        if not uids:
+            raise RuntimeError("User did not provide any uids")
+        for uid in uids:
+            if uid not in self.module_backends:
+                raise RuntimeError(f"Remote peer does not serve {uid}")
+        return tuple(uids)
+
+    @contextlib.asynccontextmanager
+    async def _allocate_caches(self, backends: Sequence[TransformerBackend]) -> Sequence[int]:
+        """Allocate memory caches for each transformer block, return cache handles"""
+        async with contextlib.AsyncExitStack() as stack:
+            handles = []
+            for backend in backends:
+                num_heads = backend.module.self_attention.num_heads
+                head_dim = backend.module.self_attention.head_dim
+
+                cache_descriptor = TensorDescriptor(size=(2, 1, MAX_LENGTH, num_heads, head_dim), dtype=torch.float32)
+                # [key_or_value, batch_size, max_length, num_heads, head_dim]
+
+                handles.append(await stack.enter_async_context(backend.memory_cache.allocate_cache(cache_descriptor)))
+
+            yield handles
+
+
+
+
+
+
+
+
 
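Note (not part of the diff above): the sketch below illustrates the request stream that the new rpc_inference expects from a caller. The first ExpertRequest carries the chain of module UIDs joined with CHAIN_DELIMITER (this is what _check_header splits), and each request carries exactly one serialized 3d hidden-states tensor, matching the asserts in the inference loop. The helper name make_inference_requests and the way the iterator is wired to the peer's rpc_inference stub are illustrative assumptions, not part of this codebase.

    # Hypothetical client-side sketch: produce the request stream consumed by rpc_inference.
    # Assumes `uids` are module UIDs served by the remote peer; connecting this iterator
    # to the peer's rpc_inference stub is omitted here.
    from typing import AsyncIterator, Sequence

    import torch
    from hivemind import serialize_torch_tensor
    from hivemind.proto import runtime_pb2

    from src.data_structures import CHAIN_DELIMITER, ModuleUID


    async def make_inference_requests(
        uids: Sequence[ModuleUID], steps: AsyncIterator[torch.Tensor]
    ) -> AsyncIterator[runtime_pb2.ExpertRequest]:
        """Yield one ExpertRequest per inference step; only the first carries the chained UID header."""
        uid_header = CHAIN_DELIMITER.join(uids)  # parsed back by TransformerConnectionHandler._check_header
        first_step = True
        async for hidden_states in steps:
            # the server-side loop expects a single [batch, seq_len, hidden_dim] tensor per step
            assert hidden_states.ndim == 3
            yield runtime_pb2.ExpertRequest(
                uid=uid_header if first_step else "",
                tensors=[serialize_torch_tensor(hidden_states)],
            )
            first_step = False

On the server side, each step is pushed through every requested backend in order, so the ExpertResponse streamed back contains the hidden states after the last module in the chain (serialized against requested_backends[-1].outputs_schema).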