@@ -110,6 +110,12 @@ class _ServerInferenceSession:
         if self.closed:
             raise Exception("Session is closed, cannot perform step")

+        if start_from_position is not None:
+            assert start_from_position <= self._position
+            self._position = start_from_position
+            if self.history is not None and self.history.shape[1] >= start_from_position:
+                self.history = self.history[:, :start_from_position, :] if start_from_position > 0 else None
+
         n_input_tokens = inputs.shape[1]
         if self.history is None:
             self.history = inputs
@@ -287,7 +293,6 @@ class InferenceSession:
         prompts: Optional[torch.Tensor] = None,
         hypo_ids: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-
         assert not self._closed
         if torch.is_grad_enabled():
             logger.warning("Running inference session with grad enabled. Gradients will *not* be propagated correctly.")