Explorar o código

fix cache size args, check it in tests

justheuristic hai 3 anos
pai
achega
0b09b33d07
Modificáronse 3 ficheiros con 5 adicións e 4 borrados
  1. 1 1
      .github/workflows/run-tests.yaml
  2. 1 1
      cli/run_server.py
  3. 3 2
      src/server/server.py

+ 1 - 1
.github/workflows/run-tests.yaml

@@ -81,7 +81,7 @@ jobs:
 
           python -m cli.run_server --converted_model_name_or_path $MODEL_NAME --block_indices 0:12 \
             --torch_dtype float32 --identity tests/test.id --host_maddrs /ip4/127.0.0.1/tcp/31337 \
-            --throughput 1 &> server1.log &
+            --throughput 1 --attention_cache_bytes 0.2GiB &> server1.log &
           SERVER1_PID=$!
           
           sleep 5  # wait for the first server to initialize DHT

+ 1 - 1
cli/run_server.py

@@ -107,7 +107,7 @@ def main():
     use_auth_token = args.pop("use_auth_token")
     args["use_auth_token"] = True if use_auth_token in ("True", "true", "") else use_auth_token
 
-    server = Server.create(**args, start=True, compression=compression, cache_size_bytes=attention_cache_bytes)
+    server = Server.create(**args, start=True, compression=compression, attention_cache_bytes=attention_cache_bytes)
 
     try:
         server.join()

+ 3 - 2
src/server/server.py

@@ -110,7 +110,7 @@ class Server(threading.Thread):
         torch_dtype: str = "auto",
         revision: str = "main",
         cache_dir: Optional[str] = None,
-        cache_size_bytes: Optional[int] = None,
+        attention_cache_bytes: Optional[int] = None,
         device: Optional[Union[str, torch.device]] = None,
         initial_peers: Sequence[str] = (),
         compression=CompressionType.NONE,
@@ -146,7 +146,7 @@ class Server(threading.Thread):
         logger.info(f"Running DHT node on {visible_maddrs_str}, initial peers = {initial_peers}")
 
         device = device or ("cuda" if torch.cuda.is_available() else "cpu")
-        memory_cache = MemoryCache(device, cache_size_bytes)
+        memory_cache = MemoryCache(device, attention_cache_bytes)
 
         assert isinstance(throughput, float) or throughput in ["auto", "eval"]
         if throughput in ["auto", "eval"]:
@@ -233,6 +233,7 @@ class Server(threading.Thread):
             blocks,
             throughput=throughput,
             num_connection_handlers=num_handlers,
+            inference_max_length=inference_max_length,
             device=device,
             stats_report_interval=stats_report_interval,
             update_period=update_period,