2 vuotta sitten · 55eb36ef48
--- a/src/petals/server/throughput.py
+++ b/src/petals/server/throughput.py
@@ -51,7 +51,7 @@ def get_server_throughput(
 
				     if cache_dir is None:
			
 
				         cache_dir = DEFAULT_CACHE_DIR
			
 
				     lock_path = Path(cache_dir, "throughput.lock")
			
 
				-    cache_path = Path(cache_dir, "throughput_v4.json")
			
 
				+    cache_path = Path(cache_dir, "throughput_v5.json")
			
 
				 
			
 
				     # We use the system-wide lock since only one process at a time can measure the host throughput
			
 
				     os.makedirs(lock_path.parent, exist_ok=True)
			
@@ -196,6 +196,7 @@ def measure_compute_rps(
 
				     n_steps: int,
			
 
				     inference: bool,
			
 
				 ) -> float:
			
 
				+    device = torch.device(device)
			
 
				     if not tensor_parallel_devices:
			
 
				         tensor_parallel_devices = (device,)
			
 
				     with torch.inference_mode():
			
@@ -204,13 +205,17 @@ def measure_compute_rps(
 
				 
			
 
				         cache = None
			
 
				         elapsed = 0
			
 
				-        for step in range(n_steps + 1):
			
 
				-            dummy_input = torch.randn(n_tokens, 1, config.hidden_size, device=device, dtype=dtype)
			
 
				+        dummy_input = torch.randn(1, n_tokens, config.hidden_size, device=device, dtype=dtype)
			
 
				+        _, cache = block.forward(dummy_input, use_cache=True)  # Skip the 1st step to exclude the initialization time
			
 
				+        if device.type == "cuda":
			
 
				+            torch.cuda.synchronize(device)
			
 
				 
			
 
				-            start_time = time.perf_counter()
			
 
				+        start_time = time.perf_counter()
			
 
				+        for step in range(n_steps):
			
 
				             _, cache = block.forward(dummy_input, use_cache=True, layer_past=cache if inference else None)
			
 
				-            if step >= 1:  # Skip the 1st step to exclude the initialization time
			
 
				-                elapsed += time.perf_counter() - start_time
			
 
				+        if device.type == "cuda":
			
 
				+            torch.cuda.synchronize(device)
			
 
				+        elapsed = time.perf_counter() - start_time
			
 
				         device_rps = n_steps * n_tokens / elapsed
			
 
				 
			
 
				     devices_repr = get_device_name(device)