Your Name 1 year ago
parent
commit
2e760319ab

+ 1 - 1
src/petals/client/inference_session.py

@@ -93,7 +93,7 @@ class _ServerInferenceSession:
     ) -> torch.Tensor:
         """
         Inference step: send a chunk of input tensors and receive a chunk of outputs
-        :prompts: optional DEEP prompts, added to a prefix of each layer's outputs,
+        :param prompts: optional DEEP prompts, added to a prefix of each layer's outputs,
           if specified, deep prompts should have shape [num_layers, batch_size, prefix_len, hid_size]
         """
         if self.closed:
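
For context, a minimal sketch of the deep-prompt shapes this docstring describes. The call pattern and the `session` object are illustrative only and not part of this diff; the tensor shapes follow the docstring above.

    import torch

    num_layers, batch_size, prefix_len, hid_size = 4, 1, 8, 4096
    # DEEP prompts: one trainable prefix per remote layer, added to a prefix of that layer's outputs
    deep_prompts = torch.zeros(num_layers, batch_size, prefix_len, hid_size)

    hidden_states = torch.randn(batch_size, 1, hid_size)  # hidden states for one new token
    # assuming `session` is an open inference session spanning `num_layers` remote blocks
    outputs = session.step(hidden_states, prompts=deep_prompts)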

+ 10 - 22
src/petals/client/remote_forward_backward.py

@@ -19,30 +19,30 @@ from petals.data_structures import ModuleUID, RPCInfo
 
 
 async def _forward_unary(
-    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, config: ClientConfig, **kwargs
+    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, config: ClientConfig, metadata: Optional[bytes] = None
 ) -> List[torch.Tensor]:
     outputs: runtime_pb2.ExpertResponse = await stub.rpc_forward(
-        runtime_pb2.ExpertRequest(uid=uid, tensors=list(serialized_tensors), **kwargs),
+        runtime_pb2.ExpertRequest(uid=uid, tensors=list(serialized_tensors), metadata=metadata),
         timeout=config.request_timeout,
     )
     return [deserialize_torch_tensor(t) for t in outputs.tensors]
 
 
 async def _backward_unary(
-    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, config: ClientConfig, **kwargs
+    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, config: ClientConfig, metadata: Optional[bytes] = None
 ) -> List[torch.Tensor]:
     grad_inputs: runtime_pb2.ExpertResponse = await stub.rpc_backward(
-        runtime_pb2.ExpertRequest(uid=uid, tensors=list(serialized_tensors), **kwargs),
+        runtime_pb2.ExpertRequest(uid=uid, tensors=list(serialized_tensors), metadata=metadata),
         timeout=config.request_timeout,
     )
     return [deserialize_torch_tensor(t) for t in grad_inputs.tensors]
 
 
 async def _forward_stream(
-    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, config: ClientConfig, **kwargs
+    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, config: ClientConfig, metadata: Optional[bytes] = None
 ) -> List[torch.Tensor]:
     parts = (
-        runtime_pb2.ExpertRequest(uid=uid, tensors=[part], **kwargs)
+        runtime_pb2.ExpertRequest(uid=uid, tensors=[part], metadata=metadata)
         for tensor in serialized_tensors
         for part in split_for_streaming(tensor, DEFAULT_MAX_MSG_SIZE)
     )
@@ -52,10 +52,10 @@ async def _forward_stream(
 
 
 async def _backward_stream(
-    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, config: ClientConfig, **kwargs
+    uid: str, serialized_tensors: Iterable[runtime_pb2.Tensor], stub, config: ClientConfig, metadata: Optional[bytes] = None
 ) -> List[torch.Tensor]:
     parts = (
-        runtime_pb2.ExpertRequest(uid=uid, tensors=[part], **kwargs)
+        runtime_pb2.ExpertRequest(uid=uid, tensors=[part], metadata=metadata)
         for tensor in serialized_tensors
         for part in split_for_streaming(tensor, DEFAULT_MAX_MSG_SIZE)
     )
@@ -68,31 +68,19 @@ async def run_remote_forward(
     uid: ModuleUID,
     stub: StubBase,
     rpc_info: RPCInfo,
-    *inputs: torch.Tensor,
+    *forward_inputs: torch.Tensor,
     config: ClientConfig,
     metadata: Optional[bytes] = None,
-    **kwargs,
 ) -> Tuple[torch.Tensor, ...]:
     """
     Serializes input tensors and calls "rpc_forward" on a remote server.
     Mostly adapted from https://github.com/learning-at-home/hivemind/blob/7a7c93aefffc9494c39e7b170c07cb06d8c09c4c/hivemind/moe/client/expert.py#L198
     but without RemoteExpertWorker.run_coroutine() call that leads to deadlock here.
     """
-
-    # Note: *inputs are flattened input tensors that follow the expert's info['input_schema']
-    # detach to avoid pickling the computation graph
-    assert len(kwargs) == len(rpc_info["keyword_names"]), f"Keyword args should be {rpc_info['keyword_names']}"
-    kwargs = {key: kwargs[key] for key in rpc_info["keyword_names"]}
-
-    # Note: we put keyword arguments in the same order as on a server to prevent f(a=1, b=2) != f(b=2, a=1) errors
-    forward_inputs = tuple(nested_flatten((inputs, kwargs)))
     args_schema, kwargs_schema = rpc_info["forward_schema"]
     compression = args_schema[0].compression
     forward_schema = tuple(BatchTensorDescriptor.from_tensor(arg, compression) for arg in forward_inputs)
     inputs = tuple(tensor.cpu().detach() for tensor in forward_inputs)
-    # TODO: create more explicit way to check servers schema and client's structure
-    assert len(inputs) >= len(args_schema) + 1, "Inputs and prompt tensors are necessary for a forward step"
-
     # Asynchronous serialization
     loop = asyncio.get_running_loop()
     serialized_tensors = await asyncio.gather(
@@ -106,7 +94,7 @@ async def run_remote_forward(
     size = sum(t.element_size() * t.nelement() for t in inputs)
     forward_fn = _forward_stream if size > MAX_UNARY_PAYLOAD_SIZE // 2 else _forward_unary
     # Hotfix: we use "// 2" since hivemind==1.1.5 serializes bfloat16 tensors in float32, so they take 2x more space
-    deserialized_outputs = await forward_fn(uid, serialized_tensors, stub, config, metadata=metadata, **kwargs)
+    deserialized_outputs = await forward_fn(uid, serialized_tensors, stub, config, metadata=metadata)
     return nested_pack(deserialized_outputs, structure=rpc_info["outputs_schema"])
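
Since arbitrary **kwargs no longer flow through these helpers, a caller is now expected to flatten positional and keyword tensors itself before invoking run_remote_forward. A rough sketch of that call pattern, mirroring the sequential_autograd.py hunk below; pack_args_kwargs appears there, but its import path and the surrounding names (span_uids, stub, sequence_manager, metadata) are illustrative assumptions here, not part of this diff.

    # flatten inputs, prompts and any per-block kwargs into a single tensor list plus a structure descriptor
    flat_tensors, args_structure = pack_args_kwargs(inputs, prompts, *block_kwargs)
    # args_structure is assumed to reach the server inside the serialized request metadata
    outputs = await run_remote_forward(
        span_uids,                      # chained ModuleUID of the remote blocks
        stub,                           # connection stub for the selected server
        sequence_manager.rpc_info,      # schema used to build BatchTensorDescriptors
        *flat_tensors,                  # pre-flattened forward inputs
        config=sequence_manager.config,
        metadata=metadata,              # serialized request metadata (bytes)
    )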
 
 

+ 14 - 2
src/petals/client/sequential_autograd.py

@@ -4,7 +4,7 @@ A PyTorch autograd function that runs forward/backward on a sequence of remote s
 import asyncio
 import itertools
 from collections import deque
-from typing import List, Optional, Sequence, Tuple
+from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import torch
 from hivemind import MSGPackSerializer
@@ -29,14 +29,25 @@ async def sequential_forward(
     sequence_manager: RemoteSequenceManager,
     start_index: int = 0,
     end_index: Optional[int] = None,
+    block_kwargs: Sequence[Dict[str, Any]] = (),
 ) -> Tuple[torch.Tensor, Sequence[torch.Tensor], Sequence[RemoteSpanInfo]]:
     """
     Constructs a routing path from <start_index> to <end_index>.
     Performs chained forward for each subsequence of blocks on the path.
     If some subsequence fails, reconstructs the remaining path and tries to finish the forward.
+
+    :param inputs: initial hidden states of shape [batch_size, sequence_length, hidden_size]
+    :param prompts: optional DEEP prompts, added to a prefix of each layer's outputs,
+          if specified, deep prompts should have shape [num_layers, batch_size, prefix_len, hid_size]
+    :param sequence_manager: a running SequenceManager used to select remote servers and handle failures
+    :param start_index: run remote blocks starting from this index
+    :param end_index: run remote blocks up to (but not including) this index
+    :param block_kwargs: optional per-block keyword arguments; a sequence with one dictionary for each block (may be empty)
     """
 
     assert isinstance(inputs, torch.Tensor) and inputs.ndim == 3, f"{type(inputs)}: {inputs.ndim}"
+    assert len(block_kwargs) in (0, 1, end_index - start_index), \
+        f"got {end_index - start_index} blocks but {len(block_kwargs)} sets of kwargs"
 
     inputs_device = inputs.device
     inputs_dtype = inputs.dtype
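
A hedged usage example for the new block_kwargs argument; the kwargs values below (e.g. attention_mask) are hypothetical, and only the signature change is part of this commit. The unpacking of the return value follows the annotated return type above.

    # inside an async context: one kwargs dict per remote block in the [start_index, end_index) range
    block_kwargs = [{} for _ in range(end_index - start_index)]
    block_kwargs[0] = {"attention_mask": attention_mask}  # hypothetical per-block keyword argument

    outputs, intermediate_inputs, done_spans = await sequential_forward(
        inputs,
        prompts,
        sequence_manager,
        start_index=start_index,
        end_index=end_index,
        block_kwargs=block_kwargs,
    )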
@@ -68,7 +79,8 @@ async def sequential_forward(
                 span = sequences.popleft()
 
                 stub = TransformerConnectionHandler.get_stub(sequence_manager.state.p2p, span.peer_id)
-                flat_tensors, args_structure = pack_args_kwargs(inputs, prompts[span.start : span.end])
+                flat_tensors, args_structure = pack_args_kwargs(
+                    inputs, prompts[span.start : span.end], *block_kwargs[span.start : span.end])
 
                 span_uids = CHAIN_DELIMITER.join(sequence_manager.block_uids[span.start : span.end])
                 metadata = sequence_manager.get_request_metadata(