throughput.py

import fcntl
import json
import math
import multiprocessing as mp
import os
import time
from collections import Counter
from pathlib import Path
from typing import Dict, Optional, Sequence, Union

import torch
import torch.mps
from hivemind.utils.logging import get_logger
from transformers import PretrainedConfig

from petals.server.block_utils import resolve_block_dtype
from petals.utils.convert_block import QuantType, convert_block
from petals.utils.disk_cache import DEFAULT_CACHE_DIR

logger = get_logger(__name__)

try:
    import speedtest
except ImportError:
    raise ImportError("Please `pip install speedtest-cli==2.1.3`")

if not hasattr(speedtest, "Speedtest"):
    raise ImportError(
        "You are using the wrong speedtest module. Please replace speedtest with speedtest-cli.\n"
        "To do that, run `pip uninstall -y speedtest`. Depending on your python environment, "
        "you may need to run the uninstall command two or more times, until it says 'not installed'.\n"
        "After that, please `pip install speedtest-cli==2.1.3` to install the correct version."
    )


def get_server_throughput(
    model_name: str,
    config: PretrainedConfig,
    device: torch.device,
    dtype: Union[str, torch.dtype],
    *,
    num_blocks: int,
    quant_type: QuantType,
    tensor_parallel_devices: Sequence[torch.device],
    reachable_via_relay: bool,
    relay_penalty: float = 0.2,
    force_eval: bool = False,
    cache_dir: Optional[str] = None,
) -> Dict[str, float]:
    """Measure this server's compute and network throughput (or load it from the on-disk cache), in tokens/sec."""
    dtype = resolve_block_dtype(config, dtype)

    if cache_dir is None:
        cache_dir = DEFAULT_CACHE_DIR
    lock_path = Path(cache_dir, "throughput.lock")
    cache_path = Path(cache_dir, "throughput_v5.json")

    # We use the system-wide lock since only one process at a time can measure the host throughput
    os.makedirs(lock_path.parent, exist_ok=True)
    with open(lock_path, "wb") as lock_fd:
        logger.info("Loading throughput info")
        fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX)
        # The OS will release the lock when lock_fd is closed or the process is killed
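        # Note (added for clarity): fcntl.flock() takes an advisory, exclusive lock, so a second server
        # process started on the same machine simply blocks here until the first one finishes measuring.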

        cache_key = f"model_{model_name}"
        cache_key += f"_device_{get_device_name(device).replace(' ', '_')}"
        cache_key += f"_dtype_{get_dtype_name(dtype, quant_type)}"
        if len(tensor_parallel_devices) > 1:
            for i, device_i in enumerate(tensor_parallel_devices):
                cache_key += f"_tp{i}_{get_device_name(device_i).replace(' ', '_')}"

        cache = {}
        try:
            if not force_eval and os.path.exists(cache_path):
                with open(cache_path) as cache_fd:
                    cache = json.load(cache_fd)
                assert isinstance(cache, dict)
        except Exception:
            logger.exception(f"Failed to read throughput info from {cache_path}")
            cache = {}

        if cache_key not in cache:
            cache[cache_key] = measure_throughput_info(
                config, device, dtype, quant_type=quant_type, tensor_parallel_devices=tensor_parallel_devices
            )
            try:
                os.makedirs(cache_path.parent, exist_ok=True)
                with open(cache_path, "w") as cache_fd:
                    json.dump(cache, cache_fd)
            except Exception:
                logger.exception(f"Failed to save throughput info in {cache_path}")

    throughput_info = cache[cache_key]

    # Most requests start at some block hosted by a server, then use all subsequent blocks hosted on this server.
    # Assuming the start block index is distributed uniformly, the average number of blocks used per request is
    # E[Uniform{1, 2, ..., num_blocks}] = (num_blocks + 1) / 2
    average_blocks_used = (num_blocks + 1) / 2
    throughput = throughput_info["forward_rps"] / average_blocks_used

    network_rps = throughput_info["network_rps"] * (relay_penalty if reachable_via_relay else 1)
    throughput = min(throughput, network_rps)
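    # Worked example (illustrative numbers only): with num_blocks = 9, average_blocks_used = 5;
    # forward_rps = 1000 tokens/sec then gives a compute-bound estimate of 200 tokens/sec, and with
    # network_rps = 300 tokens/sec the reported throughput is min(200, 300) = 200 tokens/sec.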
    throughput_info["throughput"] = throughput
    logger.info(f"Reporting throughput: {throughput:.1f} tokens/sec for {num_blocks} blocks")
    return throughput_info


def measure_throughput_info(
    config: PretrainedConfig,
    device: torch.device,
    dtype: torch.dtype,
    *,
    quant_type: QuantType,
    tensor_parallel_devices: Sequence[torch.device],
) -> Dict[str, float]:
    """Benchmark this host from scratch: single-token inference, 1024-token forward passes, and network speed."""
    logger.info(
        "Measuring network and compute throughput. This takes about a minute and will be cached for future runs"
    )
    return {
        "inference_rps": measure_compute_rps(
            config,
            device,
            dtype,
            quant_type=quant_type,
            tensor_parallel_devices=tensor_parallel_devices,
            n_tokens=1,
            n_steps=100,
            inference=True,
        ),
        "forward_rps": measure_compute_rps(
            config,
            device,
            dtype,
            quant_type=quant_type,
            tensor_parallel_devices=tensor_parallel_devices,
            n_tokens=1024,
            n_steps=10,
            inference=False,
        ),
        "network_rps": measure_network_rps(config),
    }


def measure_network_rps(
    config: PretrainedConfig, *, timeout: float = 60, default_speed: float = 100e6  # 100 Mbit/s
) -> Optional[float]:
    """Estimate how many tokens per second fit through this host's network link, using speedtest-cli."""
    bits_per_request = config.hidden_size * 16  # Clients usually send 16-bit tensors for forward/backward
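    # For example (hypothetical size): hidden_size = 8192 gives 8192 * 16 = 131,072 bits (~16 KiB) per token,
    # so the 100 Mbit/s default_speed corresponds to roughly 763 tokens/sec.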
    try:
        pipe_recv, pipe_send = mp.Pipe(duplex=False)
        process = mp.Process(target=_measure_bits_per_second, args=(pipe_send,))
        process.start()

        if not pipe_recv.poll(timeout):
            process.terminate()
            raise RuntimeError(f"speedtest did not finish in {timeout} seconds")
        network_info = pipe_recv.recv()
        if "exception" in network_info:
            raise RuntimeError(f"speedtest failed: {network_info['exception']}")

        network_rps = min(network_info["download"], network_info["upload"]) / bits_per_request
        if network_rps == 0:
            raise RuntimeError("speedtest has returned network_rps == 0")

        logger.info(
            f"Network throughput: {network_rps:.1f} tokens/sec "
            f"({network_info['download'] / 1e6:.2f} Mbit/s on download, "
            f"{network_info['upload'] / 1e6:.2f} Mbit/s on upload)"
        )
        return network_rps
    except RuntimeError as e:
        logger.info(f"Network throughput is not available: {e}. Using default of {default_speed / 1e6:.2f} Mbit/s")
        return default_speed / bits_per_request


def _measure_bits_per_second(pipe_send: mp.Pipe):
    """Run speedtest in a child process (so the parent can terminate it on timeout) and report via `pipe_send`."""
    try:
        s = speedtest.Speedtest()
        s.get_servers()
        s.get_best_server()
        s.download()
        s.upload()
        pipe_send.send(s.results.dict())  # download/upload speeds are reported in bits per second
    except Exception as e:
        pipe_send.send({"exception": repr(e)})


def measure_compute_rps(
    config: PretrainedConfig,
    device: torch.device,
    dtype: torch.dtype,
    *,
    quant_type: QuantType,
    tensor_parallel_devices: Sequence[torch.device],
    n_tokens: int,
    n_steps: int,
    inference: bool,
) -> float:
    """Benchmark a single transformer block on `device` and return its throughput in tokens per second."""
    device = torch.device(device)
    if not tensor_parallel_devices:
        tensor_parallel_devices = (device,)
    with torch.inference_mode():
        block = config.block_class(config).to(dtype)
        block = convert_block(block, 0, config, tensor_parallel_devices, device, quant_type=quant_type, freeze=True)

        cache = None
        elapsed = 0
        dummy_input = torch.randn(1, n_tokens, config.hidden_size, device=device, dtype=dtype)

        _, cache = block.forward(dummy_input, use_cache=True)  # Skip the 1st step to exclude the initialization time
        synchronize(device)

        start_time = time.perf_counter()
        for _ in range(n_steps):
            _, cache = block.forward(dummy_input, use_cache=True, layer_past=cache if inference else None)
        synchronize(device)
        elapsed = time.perf_counter() - start_time
        device_rps = n_steps * n_tokens / elapsed
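        # e.g., 10 steps over a 1024-token batch finishing in 2.0 s would yield 10 * 1024 / 2.0 = 5120 tokens/sec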

    devices_repr = get_device_name(device)
    if len(tensor_parallel_devices) > 1:
        device_names = tuple(map(get_device_name, map(torch.device, tensor_parallel_devices)))
        devices_repr = ", ".join(f"{count}x {name}" for name, count in Counter(device_names).most_common())

    logger.info(
        f"{'Inference' if inference else 'Forward pass'} throughput: {device_rps:.1f} tokens/sec per block "
        f"({n_tokens} tokens/batch, {devices_repr}, {get_dtype_name(dtype, quant_type)})"
    )
    return device_rps


def synchronize(device: torch.device):
    """Wait for all pending kernels on `device`, so that the wall-clock timings above are accurate."""
    if device.type == "cuda":
        torch.cuda.synchronize(device)
    elif device.type == "mps":
        torch.mps.synchronize()


def get_device_name(device: torch.device) -> str:
    return f"{torch.cuda.get_device_name(device)} GPU" if device.type == "cuda" else device.type.upper()


def get_dtype_name(dtype: torch.dtype, quant_type: QuantType) -> str:
    name = str(dtype).replace("torch.", "")
    if quant_type != QuantType.NONE:
        name += f", quantized to {quant_type.name.lower()}"
    return name
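

if __name__ == "__main__":
    # Illustrative usage sketch (not part of the original module): benchmark only the network path.
    # The full entry point is get_server_throughput(), but it requires a Petals model config that defines
    # `block_class` (e.g. a DistributedBloomConfig) plus num_blocks/quant_type choices, so only
    # measure_network_rps is demoed here; hidden_size=4096 is a hypothetical value used for illustration.
    dummy_config = PretrainedConfig(hidden_size=4096)
    print("network_rps:", measure_network_rps(dummy_config))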