commit ed8d7f41b8

+ 22 - 14
src/petals/server/backend.py

@@ -53,13 +53,22 @@ class TransformerBackend(ModuleBackend):
         max_batch_size = self.forward_pool.max_batch_size
         device = self.module.devices[self.module.output_device_index]
         self.inference_pool = PrioritizedTaskPool(
-            self.inference_step, max_batch_size=max_batch_size, device=device, name=f"{self.name}_inference"
+            lambda args, kwargs: self.inference_step(*args, **kwargs),
+            max_batch_size=max_batch_size,
+            device=device,
+            name=f"{self.name}_inference",
         )  # note: inference_pools may be merged later, see merge_inference_pools_inplace
         self.forward_pool = PrioritizedTaskPool(
-            self.forward, max_batch_size=max_batch_size, device=device, name=f"{self.name}_forward"
+            lambda args, kwargs: self.forward(*args, **kwargs),
+            max_batch_size=max_batch_size,
+            device=device,
+            name=f"{self.name}_forward",
         )
         self.backward_pool = PrioritizedTaskPool(
-            self.backward, max_batch_size=max_batch_size, device=device, name=f"{self.name}_backward"
+            lambda args, kwargs: self.backward(*args, **kwargs),
+            max_batch_size=max_batch_size,
+            device=device,
+            name=f"{self.name}_backward",
         )
 
         self.dtype = backend_dtype
@@ -96,27 +105,25 @@ class TransformerBackend(ModuleBackend):
             cache_tensors.extend((keys, values))
         return cache_tensors
 
-    def forward(self, *args: torch.Tensor, active_adapter: Optional[str], **kwargs) -> Tuple[torch.Tensor, ...]:
+    def forward(self, active_adapter: Optional[str], *args: torch.Tensor, **kwargs) -> Tuple[torch.Tensor, ...]:
         with self._peft_module.using_adapter(active_adapter), torch.no_grad():
             return self.module(*args, **kwargs)
 
     def backward(
-        self, grad_outputs: torch.Tensor, *args, active_adapter: Optional[str], **kwargs
+        self, active_adapter: Optional[str], grad_outputs: torch.Tensor, *args, **kwargs
     ) -> Tuple[torch.Tensor, ...]:
         assert any(x.requires_grad for x in nested_flatten((args, kwargs)) if isinstance(x, torch.Tensor))
         with self._peft_module.using_adapter(active_adapter), torch.enable_grad():
             (outputs,) = self.module(*args, **kwargs)
             assert isinstance(outputs, torch.Tensor) and outputs.shape == grad_outputs.shape
             torch.autograd.backward((outputs,), grad_tensors=(grad_outputs,), create_graph=False, retain_graph=False)
-        return nested_map(lambda x: x.grad if isinstance(x.grad, torch.Tensor) and x.requires_grad else None)
+        return nested_map(
+            lambda x: x.grad if isinstance(x.grad, torch.Tensor) and x.requires_grad else None, (args, kwargs)
+        )
 
     @torch.inference_mode()
     def inference_step(
-        self,
-        hidden_states: torch.Tensor,
-        hypo_ids: torch.LongTensor,
-        kwargs: Dict[str, torch.Tensor],
-        inference_info: InferenceMetadata,
+        self, hidden_states: torch.Tensor, hypo_ids: torch.LongTensor, inference_info: InferenceMetadata, **kwargs
     ) -> Tuple[torch.Tensor, ...]:
         assert hidden_states.ndim == 3, "expected hidden states to be 3-dimensional: [batch_size, seq_len, hid_size]"
         seq_len = hidden_states.shape[1]
@@ -217,8 +224,9 @@ def merge_inference_pools_inplace(backends: Dict[ExpertUID, TransformerBackend])
     """Replace each backend's rpc_inference pools with a combined pool runs multiple blocks in one call"""
     assert len(backends) != 0 and all(isinstance(b, TransformerBackend) for b in backends.values())
     first_pool = next(iter(backends.values())).inference_pool
+    merged_inference_func = _MergedInferenceStep(backends)
     merged_pool = PrioritizedTaskPool(
-        _MergedInferenceStep(backends),
+        lambda args, kwargs: merged_inference_func(*args, **kwargs),
         max_batch_size=first_pool.max_batch_size,
         device=first_pool.device,
         name=f"merged_inference",
@@ -237,9 +245,9 @@ class _MergedInferenceStep:
         self,
         hidden_states: torch.Tensor,
         hypo_ids: torch.LongTensor,
-        backend_kwargs: Sequence[Dict[str, torch.Tensor]],
         inference_infos: Sequence[InferenceMetadata],
         *optional_prompts: Optional[torch.Tensor],
+        backend_kwargs: Sequence[Dict[str, torch.Tensor]],
     ) -> Tuple[torch.Tensor, ...]:
         assert (
             len(inference_infos) == len(optional_prompts) == len(backend_kwargs)
@@ -248,6 +256,6 @@ class _MergedInferenceStep:
             if optional_prompt is not None:
                 hidden_states[:, : optional_prompt.shape[1]] += optional_prompt
             (hidden_states,) = self.backends[inference_info.uid].inference_step(
-                hidden_states, hypo_ids, kwargs, inference_info
+                hidden_states, hypo_ids, inference_info, **kwargs
             )
         return (hidden_states,)
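
Reviewer note, not part of the diff: the lambdas above exist because PrioritizedTaskPool now hands each handler a single (args, kwargs) pair (see the task_pool.py changes below), so the bound methods are re-expanded into ordinary calls, and active_adapter moves into the first positional slot so that the remaining positionals are plain tensors. A minimal toy sketch of the convention, with made-up values:

```python
import torch

# Toy stand-in for TransformerBackend.forward with the new argument order:
# the adapter name comes first, followed by tensor positionals.
def forward(active_adapter, *args, **kwargs):
    return tuple(x * 2 for x in args)

# The pool delivers an (args, kwargs) pair, so the lambda re-expands it into a normal call.
handler = lambda args, kwargs: forward(*args, **kwargs)

args, kwargs = ("base_adapter", torch.ones(2, 3)), {}
(doubled,) = handler(args, kwargs)  # equivalent to forward("base_adapter", torch.ones(2, 3))
print(doubled.shape)  # torch.Size([2, 3])
```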

+ 15 - 6
src/petals/server/block_functions.py

@@ -66,8 +66,8 @@ async def run_rpc_forward(
             hidden_states, points=points / len(requested_backends), backend=backend, type="forward"
         )
         (hidden_states,) = await backend.forward_pool.submit_task(
+            active_adapter,
             hidden_states,
-            active_adapter=active_adapter,
             **kwargs,
             priority=priority,
             size=num_tokens,
@@ -113,7 +113,7 @@ async def run_rpc_backward(
             hidden_states, points=points / len(requested_backends), backend=backend, type="forward_in_backward"
         )
         (hidden_states,) = await backend.forward_pool.submit_task(
-            hidden_states, active_adapter, **kwargs, priority=priority, size=num_tokens
+            active_adapter, hidden_states, **kwargs, priority=priority, size=num_tokens
         )
 
         assert isinstance(hidden_states, torch.Tensor)
@@ -131,7 +131,7 @@ async def run_rpc_backward(
             inp, grad_outputs, points=points / len(requested_backends), backend=backend, type="backward"
         )
         (grad_outputs,) = await backend.backward_pool.submit_task(
-            inp, grad_outputs, active_adapter, **kwargs, priority=priority, size=num_tokens
+            active_adapter, grad_outputs, inp, **kwargs, priority=priority, size=num_tokens
         )
 
         assert isinstance(grad_outputs, torch.Tensor)
@@ -211,7 +211,7 @@ async def iterate_rpc_inference(
                     hypo_ids,
                     inference_infos,
                     *prompts,
-                    backend_kwargs,
+                    backend_kwargs=backend_kwargs,
                     priority=priority,
                     size=num_tokens,
                 )
@@ -221,7 +221,13 @@ async def iterate_rpc_inference(
                 ):
                     inference_infos = (InferenceMetadata(uid, prefix_length, tuple(handles), active_adapter),)
                     (hidden_states,) = await backend.inference_pool.submit_task(
-                        hidden_states, hypo_ids, inference_infos, prompt, **kwargs, priority=priority, size=num_tokens
+                        hidden_states,
+                        hypo_ids,
+                        inference_infos,
+                        prompt,
+                        backend_kwargs=(kwargs,),
+                        priority=priority,
+                        size=num_tokens,
                     )
 
         # serialize and send last layer outputs
@@ -250,6 +256,9 @@ def _check_inputs(
             f"(one for each block). Found {len(backend_kwargs)} instead."
         )
     if len(backend_kwargs) == 1:
-        backend_kwargs = (backend_kwargs,) * len(requested_backends)
+        backend_kwargs = backend_kwargs * len(requested_backends)
     assert len(backend_kwargs) == len(requested_backends)
+    for i, kwargs in enumerate(backend_kwargs):
+        if not isinstance(kwargs, dict):
+            raise RuntimeError(f"Expected kwargs for block {i} to be a dictionary, got {type(kwargs)}")
     return args, backend_kwargs
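
Side note on the _check_inputs fix, illustration only with a hypothetical kwargs payload: the old (backend_kwargs,) * n wrapped the single-element tuple a second time, producing nested tuples instead of one kwargs dict per block; repeating the tuple itself broadcasts correctly, and the new loop rejects any non-dict entry early. A standalone check of the two expressions:

```python
backend_kwargs = ({"attention_mask": None},)  # one kwargs dict wrapped in a tuple
num_blocks = 3

fixed = backend_kwargs * num_blocks       # ({...}, {...}, {...}): one dict per block
buggy = (backend_kwargs,) * num_blocks    # (({...},), ({...},), ({...},)): nested tuples

assert all(isinstance(kw, dict) for kw in fixed)
assert not any(isinstance(kw, dict) for kw in buggy)
```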

+ 14 - 11
src/petals/server/task_pool.py

@@ -4,14 +4,17 @@ import threading
 import time
 from concurrent.futures._base import PENDING
 from dataclasses import dataclass, field
+from functools import partial
 from queue import PriorityQueue
 from typing import Any, List, Optional, Sequence, Tuple, Union
 
 import torch
-from hivemind import get_logger
+from hivemind import get_logger, nested_map
 from hivemind.moe.server.task_pool import TaskPoolBase
 from hivemind.utils.mpfuture import ALL_STATES, MPFuture
 
+from petals.utils.packaging import pack_args_kwargs, unpack_args_kwargs
+
 logger = get_logger(__name__)
 
 
@@ -19,9 +22,10 @@ logger = get_logger(__name__)
 class Task:
     priority: float
     time_submitted: float
+    size: int
     future: MPFuture = field(compare=False)
-    args: Sequence[Union[torch.Tensor, Any]] = field(compare=False)
-    size: int = 1
+    flat_tensors: Sequence[torch.Tensor] = field(compare=False)
+    structure: Any
 
     @property
     def uid(self) -> int:
@@ -105,14 +109,13 @@ class PrioritizedTaskPool(TaskPoolBase):
             logger.warning(f"{self.__class__.__name__} failed to shut down gracefully, sending SIGTERM")
             self.terminate()
 
-    def submit_task(self, *args: Any, priority: float = 0.0, size: int = 1) -> MPFuture:
+    def submit_task(self, *args: Any, priority: float = 0.0, size: int = 1, **kwargs: Any) -> MPFuture:
         """Add task to this pool's queue, return Future for its output"""
         future = MPFuture()
         # Remove shmem from MPFuture. This disables the .cancel() feature but
         # saves the server from "could not unlink the shared memory file" crashes during rebalancing
         future._shared_state_code = torch.tensor([ALL_STATES.index(PENDING)], dtype=torch.uint8)
-
-        task = Task(priority, time.monotonic(), future, args, size=size)
+        task = Task(priority, time.monotonic(), size, future, *pack_args_kwargs(*args, **kwargs))
         if task.size > self.max_batch_size:
             exc = ValueError(f"Task size greater than max_batch_size ({self.max_batch_size}), it can't be processed")
             task.future.set_exception(exc)
@@ -125,25 +128,25 @@ class PrioritizedTaskPool(TaskPoolBase):
 
     def load_batch_to_runtime(
         self, timeout: Optional[float] = None, device: Optional[torch.device] = None
-    ) -> Tuple[Any, List[torch.Tensor]]:
+    ) -> Tuple[int, Any]:
         """receive next batch of arrays"""
         device = device if device is not None else self.device
         task = self._ordered_tasks.get(block=True, timeout=timeout)
-        batch_inputs = [_move_to_device_if_tensor(arg, device, share_memory=False) for arg in task.args]
+        device_flat_tensors = [_move_to_device_if_tensor(arg, device, share_memory=False) for arg in task.flat_tensors]
         self._dispatched_tasks[task.uid] = task
         self.batch_receiver.recv()  # reduce the number of active batches
         if not self._ordered_tasks.empty():
             first_remaining_task: Task = self._ordered_tasks.queue[0]
             self.priority = (first_remaining_task.priority, first_remaining_task.time_submitted)
-        return task.uid, batch_inputs
+        return task.uid, unpack_args_kwargs(device_flat_tensors, task.structure)
 
     def send_outputs_from_runtime(self, uid: int, batch_outputs: List[torch.Tensor]):
         """send results for a processed batch, previously loaded through load_batch_to_runtime"""
-        batch_outputs = [_move_to_device_if_tensor(output, device="cpu", share_memory=True) for output in batch_outputs]
+        batch_outputs = nested_map(partial(_move_to_device_if_tensor, device="cpu", share_memory=True), batch_outputs)
         task = self._dispatched_tasks.pop(uid, None)
         if task is None:
             logger.error(
-                f"Internal error: task task with index {uid} is missing from the dictionary; " f"Could not set result"
+                f"Internal error: task task with index {uid} is missing from the dictionary; Could not set result"
             )
         else:
             task.future.set_result(batch_outputs)
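
For context on the new Task layout, here is a toy re-implementation of the packing helpers (the real ones live in src/petals/utils/packaging.py and may differ in detail): submit_task flattens the call's positional and keyword arguments into a flat list of tensors plus a tensor-free structure, so load_batch_to_runtime can move only the tensors to the device and rebuild the original (args, kwargs) for the handler. A sketch of the round trip, assuming hivemind's nested_flatten/nested_pack:

```python
import dataclasses
from typing import Any, Sequence, Tuple

import torch
from hivemind import nested_flatten, nested_pack


@dataclasses.dataclass(frozen=True)
class _TensorSlot:
    index: int  # position of the tensor in the flat list


def pack_args_kwargs(*args, **kwargs) -> Tuple[Sequence[torch.Tensor], Any]:
    """Split (args, kwargs) into a flat list of tensors and a tensor-free, picklable structure."""
    flat_tensors, leaves = [], []
    for value in nested_flatten((args, kwargs)):
        if isinstance(value, torch.Tensor):
            leaves.append(_TensorSlot(len(flat_tensors)))
            flat_tensors.append(value)
        else:
            leaves.append(value)
    return flat_tensors, nested_pack(leaves, (args, kwargs))


def unpack_args_kwargs(flat_tensors: Sequence[torch.Tensor], structure: Any):
    """Inverse of pack_args_kwargs: substitute tensors back into their original slots."""
    leaves = [
        flat_tensors[leaf.index] if isinstance(leaf, _TensorSlot) else leaf
        for leaf in nested_flatten(structure)
    ]
    return nested_pack(leaves, structure)


# Round trip: a Task stores (flat_tensors, structure); the runtime moves the tensors
# to its device and unpacks them back into (args, kwargs) before calling the handler.
flat, structure = pack_args_kwargs("base_adapter", torch.zeros(1, 4), prompt=torch.ones(1, 2))
args, kwargs = unpack_args_kwargs(flat, structure)
assert args[0] == "base_adapter" and kwargs["prompt"].shape == (1, 2)
```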

+ 1 - 1
src/petals/utils/packaging.py

@@ -1,4 +1,4 @@
-from typing import Any, Tuple, Sequence
+from typing import Any, Sequence, Tuple
 
 import torch
 from hivemind import nested_flatten, nested_pack