
Add cache allocation timeout

Aleksandr Borzunov, 2 years ago
parent
commit
8af3ac3623
3 changed files with 8 additions and 3 deletions
  1. +3 -0  cli/run_server.py
  2. +3 -2  src/server/cache.py
  3. +2 -1  src/server/server.py

+ 3 - 0
cli/run_server.py

@@ -57,6 +57,9 @@ def main():
     parser.add_argument('--attn_cache_size', type=str, default=None,
                         help='The size of GPU memory allocated for storing past attention keys/values between inference'
                              ' steps; examples: 500MB or 1.2GB or 1073741824 (bytes); be warned: 1KB != 1KiB')
+    parser.add_argument('--alloc_timeout', type=float, default=60,
+                        help='If the cache is full, the server will wait for this number of seconds hoping that some memory will be freed '
+                             'before rejecting the request')
     parser.add_argument('--revision', type=str, default='main',
                         help="The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models"
                              " and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git.")

+ 3 - 2
src/server/cache.py

@@ -26,8 +26,9 @@ Handle = int
 class MemoryCache:
     """A shared cache for storing tensors that persist across calls. Main use case: storing past attention KVs"""
 
-    def __init__(self, device: Union[str, torch.device], max_size_bytes: Optional[int]):
+    def __init__(self, device: Union[str, torch.device], max_size_bytes: Optional[int], alloc_timeout: float):
         self.max_size_bytes = max_size_bytes if max_size_bytes is not None else (2**64 - 1)
+        self.alloc_timeout = alloc_timeout
         self.device = device
         self._lock_metadata, self.size_decreased_event = mp.Lock(), mp.Event()
         self._current_size = mp.Value(ctypes.c_int64, 0, lock=False)
@@ -75,7 +76,7 @@ class MemoryCache:
         try:
             async with hivemind.utils.enter_asynchronously(self._lock_acquire_memory):
                 if self.current_size_bytes + allocated_size_bytes > self.max_size_bytes:
-                    await loop.run_in_executor(None, self._wait_until_available, allocated_size_bytes)
+                    await loop.run_in_executor(None, self._wait_until_available, allocated_size_bytes, self.alloc_timeout)
                 async with hivemind.utils.enter_asynchronously(self._lock_metadata):
                     allocated_handle = int(self.handle_counter)
                     self.current_size_bytes += allocated_size_bytes
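The body of `_wait_until_available` is not shown in this diff. A minimal sketch of how it could honor the new `timeout` argument, assuming it blocks on the `size_decreased_event` created in `__init__` above (the `AllocationFailed` name and the exact loop structure are assumptions, not the repo's actual code):

```python
import time

class AllocationFailed(Exception):
    """Hypothetical: raised when memory does not free up within alloc_timeout."""

def _wait_until_available(self, allocated_size_bytes: int, timeout: float) -> None:
    # Block until enough cache memory is freed, or give up after `timeout` seconds.
    # Assumes size_decreased_event (the mp.Event from __init__) is set whenever a
    # previously allocated tensor is released.
    if allocated_size_bytes > self.max_size_bytes:
        raise AllocationFailed(f"{allocated_size_bytes} bytes can never fit in the cache")
    deadline = time.perf_counter() + timeout
    while self.current_size_bytes + allocated_size_bytes > self.max_size_bytes:
        remaining = deadline - time.perf_counter()
        if remaining <= 0 or not self.size_decreased_event.wait(remaining):
            raise AllocationFailed(f"could not allocate {allocated_size_bytes} bytes in {timeout} s")
        self.size_decreased_event.clear()
```

Running this wait through `loop.run_in_executor` (see the hunk above) keeps the blocking `Event.wait` off the server's asyncio event loop.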

+ 2 - 1
src/server/server.py

@@ -55,6 +55,7 @@ class Server:
         revision: str = "main",
         cache_dir: Optional[str] = None,
         attn_cache_size: Optional[int] = None,
+        alloc_timeout: float = 60,
         device: Optional[Union[str, torch.device]] = None,
         compression=CompressionType.NONE,
         stats_report_interval: Optional[int] = None,
@@ -110,7 +111,7 @@ class Server:
         device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         self.device = device
 
-        self.memory_cache = MemoryCache(device, attn_cache_size)
+        self.memory_cache = MemoryCache(device, attn_cache_size, alloc_timeout)
 
         assert isinstance(throughput, float) or throughput in ["auto", "eval"]
         if throughput in ["auto", "eval"]:
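A quick smoke test of the new wiring, as a sketch (assumes the repo root is importable; the module path simply mirrors the file path in this diff):

```python
# Hedged example: exercise the extended MemoryCache signature directly.
from src.server.cache import MemoryCache

cache = MemoryCache(device="cpu", max_size_bytes=1 << 30, alloc_timeout=60.0)
assert cache.alloc_timeout == 60.0      # stored verbatim by __init__ (see hunk above)
assert cache.max_size_bytes == 1 << 30
```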