throughput.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. import fcntl
  2. import json
  3. import os
  4. import subprocess
  5. import time
  6. from hashlib import sha256
  7. from pathlib import Path
  8. from typing import Optional, Union
  9. import torch
  10. from hivemind.utils.logging import get_logger
  11. from transformers import BloomConfig
  12. from petals.bloom.block import WrappedBloomBlock
  13. from petals.server.block_utils import resolve_block_dtype
  14. from petals.utils.convert_8bit import replace_8bit_linear
  15. from petals.utils.disk_cache import DEFAULT_CACHE_DIR
  16. logger = get_logger(__file__)
  17. def get_host_throughput(
  18. config: BloomConfig,
  19. device: torch.device,
  20. dtype: Union[str, torch.dtype],
  21. *,
  22. load_in_8bit: bool,
  23. force_eval: bool = False,
  24. cache_dir: Optional[str] = None,
  25. ) -> float:
  26. dtype = resolve_block_dtype(config, dtype)
  27. if cache_dir is None:
  28. cache_dir = DEFAULT_CACHE_DIR
  29. lock_path = Path(cache_dir, "throughput.lock")
  30. cache_path = Path(cache_dir, "throughput_v2.json")
  31. # We use the system-wide lock since only one process at a time can measure the host throughput
  32. os.makedirs(lock_path.parent, exist_ok=True)
  33. with open(lock_path, "wb") as lock_fd:
  34. logger.info("Loading throughput info")
  35. fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX)
  36. # The OS will release the lock when lock_fd is closed or the process is killed
  37. cache_key = f"config_{sha256(str(config).encode()).hexdigest()[-16:]}"
  38. cache_key += f"_device_{get_device_name(device).replace(' ', '_')}"
  39. cache_key += f"_dtype_{get_dtype_name(dtype, load_in_8bit)}"
  40. cache = {}
  41. try:
  42. if not force_eval and os.path.exists(cache_path):
  43. with open(cache_path) as cache_fd:
  44. cache = json.load(cache_fd)
  45. assert isinstance(cache, dict)
  46. except Exception:
  47. logger.exception(f"Failed to read throughput info from {cache_path}")
  48. cache = {}
  49. if cache_key not in cache:
  50. cache[cache_key] = measure_throughput_info(config, device, dtype, load_in_8bit=load_in_8bit)
  51. try:
  52. os.makedirs(cache_path.parent, exist_ok=True)
  53. with open(cache_path, "w") as cache_fd:
  54. json.dump(cache, cache_fd)
  55. except Exception:
  56. logger.exception(f"Failed to save throughput info in {cache_path}")
  57. return cache[cache_key]
  58. def measure_throughput_info(
  59. config: BloomConfig,
  60. device: torch.device,
  61. dtype: torch.dtype,
  62. *,
  63. load_in_8bit: bool,
  64. ) -> float:
  65. """Measure network and compute throughput in forward pass tokens per second"""
  66. logger.info(
  67. "Measuring network and compute throughput. This takes about a minute and will be cached for future runs"
  68. )
  69. return min(
  70. measure_network_rps(config),
  71. measure_compute_rps(config, device, dtype, load_in_8bit=load_in_8bit),
  72. )
  73. def measure_network_rps(config: BloomConfig) -> float:
  74. proc = subprocess.run("python3 -m petals.cli.speed_test --json", shell=True, capture_output=True)
  75. if proc.returncode != 0:
  76. raise RuntimeError(f"Failed to measure network throughput (stdout: {proc.stdout}, stderr: {proc.stderr})")
  77. network_info = json.loads(proc.stdout)
  78. bits_per_request = config.hidden_size * 16 # Clients usually send 16-bit tensors for forward/backward
  79. network_rps = min(network_info["download"], network_info["upload"]) / bits_per_request
  80. logger.info(
  81. f"Network throughput: "
  82. f"{network_info['download'] / 1e6:.2f} Mbit/s on download, "
  83. f"{network_info['upload'] / 1e6:.2f} Mbit/s on upload, "
  84. f"{network_rps:.1f} RPS"
  85. )
  86. return network_rps
  87. def measure_compute_rps(
  88. config: BloomConfig,
  89. device: torch.device,
  90. dtype: torch.dtype,
  91. *,
  92. load_in_8bit: bool,
  93. n_tokens: int = 16,
  94. n_steps: int = 500,
  95. ) -> float:
  96. with torch.inference_mode():
  97. block = WrappedBloomBlock(config).to(dtype)
  98. if load_in_8bit:
  99. block = replace_8bit_linear(block)
  100. block = block.to(device)
  101. cache = None
  102. elapsed = 0
  103. for step in range(n_steps + 1):
  104. dummy_input = torch.randn(n_tokens, 1, config.hidden_size, device=device, dtype=dtype)
  105. start_time = time.perf_counter()
  106. _, cache = block.forward(dummy_input, use_cache=True, layer_past=cache)
  107. if step >= 1: # Skip the 1st step to exclude the initialization time
  108. elapsed += time.perf_counter() - start_time
  109. device_rps = n_steps * n_tokens / elapsed
  110. logger.info(
  111. f"Forward pass throughput ({get_device_name(device)}, {get_dtype_name(dtype, load_in_8bit)}): "
  112. f"{device_rps:.1f} RPS"
  113. )
  114. return device_rps
  115. def get_device_name(device: torch.device) -> str:
  116. return f"{torch.cuda.get_device_name(device)} GPU" if device.type == "cuda" else "CPU"
  117. def get_dtype_name(dtype: torch.dtype, load_in_8bit: bool) -> str:
  118. return "8-bit" if load_in_8bit else str(dtype)