Commit d2d0403a11 by artek0chumak, 2 years ago

+ 3 - 4
src/petals/bloom/block.py

@@ -18,20 +18,19 @@ class WrappedBloomBlock(BloomBlock):
    def forward(
        self,
        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
+        attention_mask: torch.Tensor,
        *args,
        alibi: Optional[torch.Tensor] = None,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs
    ):
-        assert attention_mask is None
        batch_size, seq_length = hidden_states.shape[:2]
        past_length = 0 if layer_past is None else layer_past[0].shape[-1]
        if alibi is None:
            alibi = build_alibi_tensor(attention_mask, num_heads=self.num_heads, dtype=hidden_states.dtype)
-        causal_mask = self._prepare_attn_mask(attention_mask, (batch_size, seq_length), past_length)
+        attention_mask = self._prepare_attn_mask(attention_mask, (batch_size, seq_length), past_length)
        return super().forward(
-            hidden_states, *args, attention_mask=causal_mask, alibi=alibi, layer_past=layer_past, **kwargs
+            hidden_states, *args, attention_mask=attention_mask, alibi=alibi, layer_past=layer_past, **kwargs
        )

    def _prepare_attn_mask(
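
Note: after this change WrappedBloomBlock no longer accepts attention_mask=None; callers must pass an explicit 2-D mask, and the block derives both the ALiBi tensor and the causal mask from it. A minimal sketch of the new calling convention, assuming `block` is an already-initialized WrappedBloomBlock and the variable names are illustrative:

    import torch

    # hidden_states: [batch_size, seq_length, hidden_size]; `block` is a WrappedBloomBlock (assumption)
    attention_mask = torch.ones(hidden_states.shape[:2], device=hidden_states.device)
    outputs = block(hidden_states, attention_mask)  # alibi and the causal mask are built inside forward()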

+ 3 - 3
src/petals/client/remote_forward_backward.py

@@ -88,8 +88,8 @@ async def run_remote_forward(
    # Modify forward_schema to support prompts
    args_schema, kwargs_schema = rpc_info["forward_schema"]
    # TODO: rm this assert when support arbitrary number of input tensors
-    assert len(args_schema) == 1 and len(inputs) == 2
-    forward_schema_with_prompts = (tuple(args_schema * len(inputs)), kwargs_schema)
+    assert len(args_schema) == 2 and len(inputs) == 3
+    forward_schema_with_prompts = ((args_schema[0], args_schema[1], args_schema[0]), kwargs_schema)

    if not nested_compare(forward_inputs, forward_schema_with_prompts):
        raise TypeError(f"Inputs do not match expert input schema. Did you pass the right number of parameters?")
@@ -135,7 +135,7 @@ async def run_remote_backward(

    # Modify forward_schema to support prompts
    args_schema, kwargs_schema = rpc_info["forward_schema"]
-    assert len(args_schema) == 1 and isinstance(inputs, torch.Tensor)
+    assert len(args_schema) == 2 and isinstance(inputs, torch.Tensor)
    # TODO generalize this
    prompts_schema = next(iter(args_schema))
    backward_schema = tuple(nested_flatten((rpc_info["forward_schema"], rpc_info["outputs_schema"], prompts_schema)))
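
Note: with the mask added, the expert's forward schema is assumed to hold two positional descriptors (hidden states and attention mask), and the client ships three tensors per span. A sketch of how the schema lines up with the payload (descriptor names are placeholders, not repo identifiers):

    # args_schema == (hidden_states_descr, attention_mask_descr) after this commit (assumption)
    hidden_descr, mask_descr = args_schema
    forward_schema_with_prompts = ((hidden_descr, mask_descr, hidden_descr), kwargs_schema)
    # matching payload order on the wire: [hidden_states, attention_mask, prompts]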

+ 4 - 4
src/petals/client/remote_model.py

@@ -184,16 +184,16 @@ class DistributedBloomModel(_LowCPUMemoryMixin, BloomModel):
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

+        batch_size = inputs_embeds.shape[0]
        if self.config.tuning_mode and "ptune" in self.config.tuning_mode:
-            batch_size = inputs_embeds.shape[0]
            prompts, intermediate_prompts = self.get_prompt(batch_size)
            inputs_embeds = torch.cat([prompts, inputs_embeds], dim=1)

-        if attention_mask is None:
-            attention_mask = torch.ones((batch_size, input_shape[-1]), device=hidden_states.device)
-
        hidden_states = self.word_embeddings_layernorm(inputs_embeds)
        output_shape = input_shape + (hidden_states.size(-1),)
+
+        if attention_mask is None:
+            attention_mask = torch.ones((batch_size, hidden_states.size(1)), device=hidden_states.device)

        if self.config.tuning_mode and "ptune" in self.config.tuning_mode:
            hidden_states = self.h(hidden_states, attention_mask, prompts=intermediate_prompts)
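
Note: the removed lines appear to reference hidden_states and batch_size before they were defined outside "ptune" mode, and they sized the default mask from input_shape[-1]. The default mask is now created after any soft prompts are prepended, so its length matches the full hidden-states sequence. A sketch of the resulting shapes in "ptune" mode (pre_seq_len is an illustrative name, not taken from this diff):

    # input_ids:     [batch_size, seq_length]
    # inputs_embeds: [batch_size, pre_seq_len + seq_length, hidden_size] after torch.cat above
    # default mask:  torch.ones((batch_size, hidden_states.size(1)))  -> covers the prompt positions too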

+ 2 - 2
src/petals/client/remote_sequential.py

@@ -51,10 +51,10 @@ class RemoteSequential(nn.Module):
            assert isinstance(sequence_manager.sequence_info.block_uids, tuple)
            self.is_subsequence = self.sequence_manager.sequence_info.block_uids != block_uids

-    def forward(self, inputs: torch.Tensor, prompts: torch.Tensor = DUMMY):
+    def forward(self, inputs: torch.Tensor, attention_mask: torch.Tensor, prompts: torch.Tensor = DUMMY):
        assert inputs.ndim == 3, "inputs must be a tensor of shape [batch_size, seq_length, hidden_size]"
        assert inputs.shape[1] <= 2048, "The sequence length is capped at 2048 tokens in this version"
-        outputs = _RemoteSequentialAutogradFunction.apply(inputs, prompts, self.sequence_manager)
+        outputs = _RemoteSequentialAutogradFunction.apply(inputs, attention_mask, prompts, self.sequence_manager)
        return outputs

    def __getitem__(self, ix: Union[int, slice]) -> RemoteSequential:
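
Note: RemoteSequential.forward now takes the attention mask as a required second positional argument. A hypothetical call site (names are illustrative, not taken from the repo):

    # remote_blocks: a RemoteSequential; inputs_embeds: [batch_size, seq_length, hidden_size]
    attention_mask = torch.ones(inputs_embeds.shape[:2], device=inputs_embeds.device)
    hidden_states = remote_blocks(inputs_embeds, attention_mask)  # prompts still default to DUMMY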

+ 20 - 11
src/petals/client/sequential_autograd.py

@@ -25,6 +25,7 @@ MAX_TOKENS_IN_BATCH = 1024

async def sequential_forward(
    inputs: torch.Tensor,
+    attention_mask: torch.Tensor,
    prompts: torch.Tensor,
    sequence_manager: RemoteSequenceManager,
    start_index: int = 0,
@@ -37,10 +38,12 @@ async def sequential_forward(
    """

    assert isinstance(inputs, torch.Tensor) and inputs.ndim == 3, f"{type(inputs)}: {inputs.ndim}"
+    assert isinstance(attention_mask, torch.Tensor) and attention_mask.ndim == 2, f"{type(attention_mask)}: {attention_mask.ndim}"

    inputs_device = inputs.device
    inputs_dtype = inputs.dtype
    inputs = inputs.cpu()
+    attention_mask = attention_mask.cpu()
    prompts = prompts.cpu()

    end_index = end_index if end_index is not None else len(sequence_manager.block_uids)
@@ -68,7 +71,7 @@ async def sequential_forward(
                span = sequences.popleft()

                stub = TransformerConnectionHandler.get_stub(sequence_manager.p2p, span.peer_id)
-                inputs_and_prompts = [inputs, prompts[span.start : span.end]]
+                inputs_and_prompts = [inputs, attention_mask, prompts[span.start : span.end]]

                span_uids = CHAIN_DELIMITER.join(sequence_manager.block_uids[span.start : span.end])
                metadata = sequence_manager.get_request_metadata("rpc_forward", span_uids, *inputs_and_prompts)
@@ -111,6 +114,7 @@ async def sequential_forward(
async def sequential_backward(
    grad_outputs: Sequence[torch.Tensor],
    intermediate_inputs: List[torch.Tensor],
+    attention_mask: torch.Tensor,
    prompts: torch.Tensor,
    forward_sequences: List[RemoteSpanInfo],
    sequence_manager: RemoteSequenceManager,
@@ -128,6 +132,7 @@ async def sequential_backward(

    grad_outputs = [tensor.cpu() for tensor in grad_outputs]
    intermediate_inputs = [tensor.cpu() for tensor in intermediate_inputs]
+    attention_mask = attention_mask.cpu()
    prompts = prompts.cpu()

    grad_prompts_reversed = []
@@ -160,6 +165,7 @@ async def sequential_backward(
                    stub,
                    sequence_manager.rpc_info,
                    inputs,
+                    attention_mask,
                    grad_outputs,
                    prompts[span.start : span.end],
                    timeout=sequence_manager.request_timeout,
@@ -191,25 +197,25 @@ async def sequential_backward(
    return grad_outputs, grad_prompts


-async def _gather_forward(input_batches, prompt_batches, sequence_manager):
+async def _gather_forward(input_batches, attention_mask_batches, prompt_batches, sequence_manager):
    """Wrapper for asyncio.gather to perform parallel sequential forwards"""
    return await asyncio.gather(
        *[
-            sequential_forward(input_batch, prompt_batch, sequence_manager)
-            for input_batch, prompt_batch in zip(input_batches, prompt_batches)
+            sequential_forward(input_batch, attention_mask_batch, prompt_batch, sequence_manager)
+            for input_batch, attention_mask_batch, prompt_batch in zip(input_batches, attention_mask_batches, prompt_batches)
        ]
    )


async def _gather_backward(
-    grad_output_batches, intermediate_input_batches, prompt_batches, forward_sequences, sequence_manager
+    grad_output_batches, intermediate_input_batches, attention_mask_batches, prompt_batches, forward_sequences, sequence_manager
):
    """Wrapper for asyncio.gather to perform parallel sequential backwards"""
    return await asyncio.gather(
        *[
-            sequential_backward((grad_output,), input_batch, prompt_batch, spans, sequence_manager)
-            for grad_output, input_batch, prompt_batch, spans in zip(
-                grad_output_batches, intermediate_input_batches, prompt_batches, forward_sequences
+            sequential_backward((grad_output,), input_batch, attention_mask_batch, prompt_batch, spans, sequence_manager)
+            for grad_output, input_batch, attention_mask_batch, prompt_batch, spans in zip(
+                grad_output_batches, intermediate_input_batches, attention_mask_batches, prompt_batches, forward_sequences
            )
        ]
    )
@@ -222,16 +228,17 @@ class _RemoteSequentialAutogradFunction(torch.autograd.Function):
    """

    @staticmethod
-    def forward(ctx, inputs: torch.Tensor, prompts: torch.Tensor, sequence_manager: RemoteSequenceManager):
+    def forward(ctx, inputs: torch.Tensor, attention_mask: torch.Tensor, prompts: torch.Tensor, sequence_manager: RemoteSequenceManager):
        batch_size = max(MAX_TOKENS_IN_BATCH // inputs.shape[1], 1)
        input_batches: Sequence[torch.Tensor] = inputs.detach().split(batch_size)
+        attention_mask_batches: Sequence[torch.Tensor] = attention_mask.detach().split(batch_size)
        if is_dummy(prompts):
            prompt_batches = [DUMMY] * len(input_batches)
        else:
            prompt_batches: Sequence[torch.Tensor] = prompts.detach().split(batch_size, dim=1)

        sequence_manager.rpc_info  # lazy init
-        outputs = RemoteExpertWorker.run_coroutine(_gather_forward(input_batches, prompt_batches, sequence_manager))
+        outputs = RemoteExpertWorker.run_coroutine(_gather_forward(input_batches, attention_mask_batches, prompt_batches, sequence_manager))
        assert len(outputs) == len(input_batches)

        output_batches = [output[0] for output in outputs]
@@ -241,6 +248,7 @@ class _RemoteSequentialAutogradFunction(torch.autograd.Function):
        ctx.prompt_batches = prompt_batches
        ctx.sequence_manager = sequence_manager
        ctx.intemediate_input_batches = intemediate_input_batches
+        ctx.attention_mask_batches = attention_mask_batches
        ctx.sequences_for_batches = sequences_for_batches
        return torch.cat(output_batches, dim=0)

@@ -258,13 +266,14 @@ class _RemoteSequentialAutogradFunction(torch.autograd.Function):
            _gather_backward(
                grad_output_batches,
                intermediate_input_batches,
+                ctx.attention_mask_batches,
                ctx.prompt_batches,
                forward_sequences,
                ctx.sequence_manager,
            )
        )
        grad_input_batches = [output[0][0] for output in outputs]
-        grad_prompt_batches = [output[1] for output in outputs]
+        grad_prompt_batches = [output[2] for output in outputs]

        grad_inputs = torch.cat(grad_input_batches, dim=0)
        dummy_grad_prompts = [grad_prompt is None for grad_prompt in grad_prompt_batches]
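
Note: the autograd function splits the mask with the same row count as the inputs, so each micro-batch sent to a span carries its own mask rows. A self-contained sketch of that splitting (shapes are illustrative; MAX_TOKENS_IN_BATCH = 1024 as defined in this file):

    import torch

    MAX_TOKENS_IN_BATCH = 1024
    inputs = torch.randn(8, 512, 64)                # [batch, seq, hidden], small hidden size for illustration
    attention_mask = torch.ones(8, 512)

    batch_size = max(MAX_TOKENS_IN_BATCH // inputs.shape[1], 1)          # -> 2 sequences per micro-batch
    input_batches = inputs.detach().split(batch_size)                    # split along dim 0
    attention_mask_batches = attention_mask.detach().split(batch_size)   # identical split sizes
    assert all(x.shape[0] == m.shape[0] for x, m in zip(input_batches, attention_mask_batches))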

+ 1 - 0
src/petals/server/backend.py

@@ -84,6 +84,7 @@ class TransformerBackend(ModuleBackend):
    def inference_step(
        self,
        hidden_states: torch.Tensor,
+        attention_masks: torch.Tensor,
        hypo_ids: torch.LongTensor,
        inference_info: InferenceMetadata,
    ) -> Tuple[torch.Tensor, ...]:
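
Note: inference_step now receives the mask positionally between the hidden states and the hypothesis ids. A hypothetical call, assuming `backend` is a TransformerBackend and the step returns a single hidden-states tensor as its annotation suggests:

    (hidden_states,) = backend.inference_step(hidden_states, attention_masks, hypo_ids, inference_info)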

+ 8 - 5
src/petals/server/handler.py

@@ -405,15 +405,16 @@ async def _rpc_forward(
     """
     """
     Run forward pass on deserialized inputs and prompts, used by rpc_forward and rpc_forward_stream
     Run forward pass on deserialized inputs and prompts, used by rpc_forward and rpc_forward_stream
 
 
-    :param flat_tensors: a list of tensors that includes first layer inputs, optional prompts and extra tensors
+    :param flat_tensors: a list of tensors that includes first layer inputs, attention_mask, optional prompts and extra tensors
     :note: some input tensors can be missing, in which case they will be replaced with dummy tensors (see is_dummy)
     :note: some input tensors can be missing, in which case they will be replaced with dummy tensors (see is_dummy)
     :param requested_backends: a sequence of transformer blocks in the same order as they appear in forward pass
     :param requested_backends: a sequence of transformer blocks in the same order as they appear in forward pass
     :returns: hidden states after the last layer [batch_size, seq_length, hid_size]
     :returns: hidden states after the last layer [batch_size, seq_length, hid_size]
     """
     """
-    hidden_states, prompts = flat_tensors
+    hidden_states, attention_masks, prompts = flat_tensors
     dtype = requested_backends[0].dtype
     dtype = requested_backends[0].dtype
     # check parse input tensors and cast dtypes
     # check parse input tensors and cast dtypes
     hidden_states = hidden_states.to(dtype)
     hidden_states = hidden_states.to(dtype)
+    attention_masks = attention_masks.to(dtype)
     assert hidden_states.ndim == 3
     assert hidden_states.ndim == 3
     if prompts is None or is_dummy(prompts):
     if prompts is None or is_dummy(prompts):
         prompts = [DUMMY] * len(requested_backends)
         prompts = [DUMMY] * len(requested_backends)
@@ -431,6 +432,7 @@ async def _rpc_forward(
         )
         )
         (hidden_states,) = await backend.forward_pool.submit_task(
         (hidden_states,) = await backend.forward_pool.submit_task(
             hidden_states,
             hidden_states,
+            attention_masks,
             priority=priority,
             priority=priority,
         )
         )
         assert isinstance(hidden_states, torch.Tensor)
         assert isinstance(hidden_states, torch.Tensor)
@@ -447,9 +449,10 @@ async def _rpc_backward(
     prioritizer: TaskPrioritizerBase,
     prioritizer: TaskPrioritizerBase,
     points: int = 0,
     points: int = 0,
 ) -> Union[torch.Tensor, Sequence[torch.Tensor]]:
 ) -> Union[torch.Tensor, Sequence[torch.Tensor]]:
-    inputs, grad_outputs, prompts = flat_tensors
+    inputs, attention_masks, grad_outputs, prompts = flat_tensors
     # Cast inputs & grad outputs to backend dtype
     # Cast inputs & grad outputs to backend dtype
     inputs = inputs.to(requested_backends[0].dtype)
     inputs = inputs.to(requested_backends[0].dtype)
+    attention_masks = attention_masks.to(requested_backends[0].dtype)
     grad_outputs = grad_outputs.to(requested_backends[-1].dtype)
     grad_outputs = grad_outputs.to(requested_backends[-1].dtype)
 
 
     if prompts is None or is_dummy(prompts):
     if prompts is None or is_dummy(prompts):
@@ -469,7 +472,7 @@ async def _rpc_backward(
         priority = prioritizer.prioritize(
         priority = prioritizer.prioritize(
             inputs, points=points / len(requested_backends), backend=backend, type="forward_in_backward"
             inputs, points=points / len(requested_backends), backend=backend, type="forward_in_backward"
         )
         )
-        (inputs,) = await backend.forward_pool.submit_task(inputs, priority=priority)
+        (inputs,) = await backend.forward_pool.submit_task(inputs, attention_masks, priority=priority)
 
 
         assert isinstance(inputs, torch.Tensor)
         assert isinstance(inputs, torch.Tensor)
 
 
@@ -485,7 +488,7 @@ async def _rpc_backward(
         priority = prioritizer.prioritize(
         priority = prioritizer.prioritize(
             inp, grad_outputs, points=points / len(requested_backends), backend=backend, type="backward"
             inp, grad_outputs, points=points / len(requested_backends), backend=backend, type="backward"
         )
         )
-        (grad_outputs,) = await backend.backward_pool.submit_task(inp, grad_outputs, priority=priority)
+        (grad_outputs,) = await backend.backward_pool.submit_task(inp, attention_masks, grad_outputs, priority=priority)
 
 
         assert isinstance(grad_outputs, torch.Tensor)
         assert isinstance(grad_outputs, torch.Tensor)
         if not is_dummy(prompt):
         if not is_dummy(prompt):
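
Note: for reference, the tensor layouts the two handlers above now unpack (inferred from the unpacking lines in this diff, not an official wire-format description):

    # _rpc_forward:  flat_tensors == (hidden_states, attention_masks, prompts)
    # _rpc_backward: flat_tensors == (inputs, attention_masks, grad_outputs, prompts)
    # in both cases the mask is cast to the first requested backend's dtype before use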