
Set default --step_timeout to 5 min (#133)

Alexander Borzunov, 2 years ago
parent
commit 66f1799d32
3 changed files with 3 additions and 3 deletions
  1. README.md (+1 -1)
  2. src/petals/cli/run_server.py (+1 -1)
  3. src/petals/server/server.py (+1 -1)

+ 1 - 1
README.md

@@ -51,7 +51,7 @@ Check out more tutorials:
 
 - **Petals** runs inference or fine-tunes large language models like [BLOOM-176B](https://huggingface.co/bigscience/bloom) by joining compute resources with people all over the Internet.
 - One participant with weak GPU can load a small part of the model, then team up with people serving the other parts to run inference or fine-tuning.
-- Inference takes ≈ 1 sec/token — 10x faster than possible with offloading, enough for chatbots and other interactive apps. Parallel inference takes ≈ 1 sec/batch.
+- Inference runs at ≈ 1 sec per step (token) — 10x faster than possible with offloading, enough for chatbots and other interactive apps. Parallel inference reaches hundreds of tokens/sec.
 - Beyond classic language model APIs — you can employ any fine-tuning and sampling methods by executing custom paths through the model or accessing its hidden states. This combines the comforts of an API with the flexibility of PyTorch.
 
 <p align="center">

+ 1 - 1
src/petals/cli/run_server.py

@@ -81,7 +81,7 @@ def main():
                         help='Timeout (in seconds) for the whole rpc_forward/rpc_backward/rpc_forward_stream/rpc_backward_stream request')
     parser.add_argument('--session_timeout', type=float, required=False, default=30 * 60,
                         help='Timeout (in seconds) for the whole inference session')
-    parser.add_argument('--step_timeout', type=float, required=False, default=60,
+    parser.add_argument('--step_timeout', type=float, required=False, default=5 * 60,
                         help="Timeout (in seconds) for waiting the next step's inputs inside an inference session")
 
     group = parser.add_mutually_exclusive_group()

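With this change, a serving node waits up to five minutes for a client's next inference step before dropping the session, instead of one minute. The old behaviour can still be requested explicitly through the flag touched in this diff; the launch command and model name below are only an illustrative assumption, not part of this commit:

    python -m petals.cli.run_server bigscience/bloom-petals --step_timeout 60
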
+ 1 - 1
src/petals/server/server.py

@@ -65,7 +65,7 @@ class Server:
         expiration: Optional[float] = None,
         request_timeout: float = 3 * 60,
         session_timeout: float = 30 * 60,
-        step_timeout: float = 60,
+        step_timeout: float = 5 * 60,
         prefetch_batches: int = 1,
         sender_threads: int = 1,
         balance_quality: float = 0.75,
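
The same 5-minute default is mirrored in the Server constructor so programmatic use matches the CLI. The snippet below is only a hedged sketch of what the timeout governs, not Petals' actual session handler: the server waits for each step's inputs and closes the session if the client stalls for longer than step_timeout.

    import asyncio

    STEP_TIMEOUT = 5 * 60  # new default, in seconds

    async def serve_session(steps: asyncio.Queue) -> None:
        # Hypothetical loop: wait for the next step's inputs from the client.
        while True:
            try:
                inputs = await asyncio.wait_for(steps.get(), timeout=STEP_TIMEOUT)
            except asyncio.TimeoutError:
                break  # client stalled for more than step_timeout -> close the session
            if inputs is None:
                break  # client ended the session explicitly
            # ... run the hosted transformer blocks on `inputs` and stream the result back ...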