
Clean up disk space (#152)

Alexander Borzunov · 2 years ago
commit 701ec7e53e

+ 59 - 14
src/petals/bloom/from_pretrained.py

@@ -8,6 +8,8 @@ If necessary, one can rewrite this to implement a different behavior, such as:
 """
 """
 from __future__ import annotations
 from __future__ import annotations
 
 
+import itertools
+import time
 from typing import Optional, OrderedDict, Union
 from typing import Optional, OrderedDict, Union
 
 
 import torch
 import torch
@@ -17,7 +19,8 @@ from transformers.models.bloom.configuration_bloom import BloomConfig
 from transformers.utils import get_file_from_repo

 from petals.bloom.block import WrappedBloomBlock
-from petals.utils.disk_cache import DEFAULT_CACHE_DIR
+from petals.server.block_utils import get_block_size
+from petals.utils.disk_cache import DEFAULT_CACHE_DIR, allow_cache_reads, allow_cache_writes, free_disk_space_for

 use_hivemind_log_handler("in_root_logger")
 logger = get_logger(__file__)
@@ -33,6 +36,7 @@ def load_pretrained_block(
     torch_dtype: Union[torch.dtype, str] = "auto",
     torch_dtype: Union[torch.dtype, str] = "auto",
     use_auth_token: Optional[str] = None,
     use_auth_token: Optional[str] = None,
     cache_dir: Optional[str] = None,
     cache_dir: Optional[str] = None,
+    max_disk_space: Optional[int] = None,
 ) -> WrappedBloomBlock:
 ) -> WrappedBloomBlock:
     """Load one BLOOM block from a converted model. See convert_model.py (or README.md) on how to convert it."""
     """Load one BLOOM block from a converted model. See convert_model.py (or README.md) on how to convert it."""
 
 
@@ -43,7 +47,12 @@ def load_pretrained_block(
 
 
     block = WrappedBloomBlock(config)
     block = WrappedBloomBlock(config)
     state_dict = _load_state_dict(
     state_dict = _load_state_dict(
-        converted_model_name_or_path, block_index, use_auth_token=use_auth_token, cache_dir=cache_dir
+        converted_model_name_or_path,
+        block_index,
+        config,
+        use_auth_token=use_auth_token,
+        cache_dir=cache_dir,
+        max_disk_space=max_disk_space,
     )
     )
 
 
     if torch_dtype == "auto":
     if torch_dtype == "auto":
@@ -62,20 +71,56 @@ def load_pretrained_block(

 def _load_state_dict(
     pretrained_model_name_or_path: str,
-    block_index: Optional[int] = None,
+    block_index: int,
+    config: BloomConfig,
+    *,
     use_auth_token: Optional[str] = None,
-    cache_dir: Optional[str] = None,
+    cache_dir: str,
+    max_disk_space: Optional[int] = None,
+    min_backoff: float = 5,
 ) -> OrderedDict[str, torch.Tensor]:
-    revision = BLOCK_BRANCH_PREFIX + str(block_index) if block_index is not None else CLIENT_BRANCH
-    archive_file = get_file_from_repo(
-        pretrained_model_name_or_path,
-        filename=WEIGHTS_NAME,
-        revision=revision,
-        use_auth_token=use_auth_token,
-        cache_dir=cache_dir,
-    )
-    state_dict = torch.load(archive_file, map_location="cpu")
-    return state_dict
+    revision = BLOCK_BRANCH_PREFIX + str(block_index)
+
+    # First, try to find the weights locally
+    try:
+        with allow_cache_reads(cache_dir):
+            archive_file = get_file_from_repo(
+                pretrained_model_name_or_path,
+                filename=WEIGHTS_NAME,
+                revision=revision,
+                use_auth_token=use_auth_token,
+                cache_dir=cache_dir,
+                local_files_only=True,
+            )
+            if archive_file is not None:
+                return torch.load(archive_file, map_location="cpu")
+    except Exception:
+        logger.debug(
+            f"Failed to load block {block_index} from cache. The block will be downloaded again", exc_info=True
+        )
+
+    # If not found, ensure that we have enough disk space to download them (maybe remove something)
+    for attempt_no in itertools.count():
+        try:
+            with allow_cache_writes(cache_dir):
+                block_size = get_block_size(config, "disk")
+                free_disk_space_for(
+                    pretrained_model_name_or_path, block_size, cache_dir=cache_dir, max_disk_space=max_disk_space
+                )
+
+                archive_file = get_file_from_repo(
+                    pretrained_model_name_or_path,
+                    filename=WEIGHTS_NAME,
+                    revision=revision,
+                    use_auth_token=use_auth_token,
+                    cache_dir=cache_dir,
+                    local_files_only=False,
+                )
+                return torch.load(archive_file, map_location="cpu")
+        except Exception as e:
+            delay = min_backoff * (2**attempt_no)
+            logger.warning(f"Failed to load block {block_index} from HF Hub (retry in {delay:.0f} sec)", exc_info=True)
+            time.sleep(delay)


 DTYPE_MAP = dict(bfloat16=torch.bfloat16, float16=torch.float16, float32=torch.float32, auto="auto")

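Note on the retry loop above: `_load_state_dict` now retries Hub downloads indefinitely with exponential backoff (`min_backoff * 2**attempt_no`, i.e. 5 s, 10 s, 20 s, ...). Below is a minimal standalone sketch of that pattern, with a hypothetical `download()` callable standing in for the `get_file_from_repo` + `torch.load` steps; it is an illustration, not code from the commit:

import itertools
import time
from typing import Callable, TypeVar

T = TypeVar("T")


def retry_with_backoff(download: Callable[[], T], min_backoff: float = 5) -> T:
    """Call `download()` until it succeeds, sleeping min_backoff * 2**attempt seconds between failures."""
    for attempt_no in itertools.count():
        try:
            return download()
        except Exception as e:
            delay = min_backoff * (2**attempt_no)  # 5, 10, 20, 40, ... seconds
            print(f"Download failed ({e}), retrying in {delay:.0f} s")
            time.sleep(delay)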
+ 19 - 2
src/petals/cli/run_server.py

@@ -47,8 +47,18 @@ def main():
                         help='Use this many threads to pass results/exceptions from Runtime to Pools')
     parser.add_argument('--inference_max_length', type=int, default=2048,
                         help='Maximum total sequence length permitted per inference, defaults to 16384 tokens')
+
     parser.add_argument('--cache_dir', type=str, default=None,
                         help='Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.')
+    parser.add_argument("--max_disk_space", type=str, default=None,
+                        help="Maximal disk space used for caches. Example: 50GB, 100GiB (GB != GiB here). "
+                             "Default: unlimited. "
+                             "For bigscience/bloom-petals, this default means that the server may use up to "
+                             "min(free_disk_space, 350GB) in the worst case, which happens when the server runs "
+                             "for a long time and caches all model blocks after a number of rebalancings. "
+                             "However, this worst case is unlikely, expect the server to consume "
+                             "the disk space equal to 2-4x of your GPU memory on average.")
+
     parser.add_argument('--device', type=str, default=None, required=False,
                         help='all blocks will use this device in torch notation; default: cuda if available else cpu')
     parser.add_argument("--torch_dtype", type=str, default="auto",
@@ -129,7 +139,14 @@ def main():
         attn_cache_size = parse_size(attn_cache_size)
     assert isinstance(
         attn_cache_size, (int, type(None))
-    ), "unrecognized value for attention_cache_bytes, examples: 1.5GB or 1500MB or 1572864000 (bytes)"
+    ), "Unrecognized value for --attn_cache_size. Correct examples: 1.5GB or 1500MB or 1572864000 (bytes)"
+
+    max_disk_space = args.pop("max_disk_space")
+    if max_disk_space is not None:
+        max_disk_space = parse_size(max_disk_space)
+    assert isinstance(
+        max_disk_space, (int, type(None))
+    ), "Unrecognized value for --max_disk_space. Correct examples: 1.5GB or 1500MB or 1572864000 (bytes)"

     if args.pop("new_swarm"):
         args["initial_peers"] = []
@@ -138,7 +155,7 @@ def main():
     if load_in_8bit is not None:
         args["load_in_8bit"] = load_in_8bit.lower() in ["true", "1"]

-    server = Server(**args, compression=compression, attn_cache_size=attn_cache_size)
+    server = Server(**args, compression=compression, max_disk_space=max_disk_space, attn_cache_size=attn_cache_size)
     try:
         server.run()
     except KeyboardInterrupt:

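The `--max_disk_space` help stresses that GB and GiB are different units (10^9 vs 2^30 bytes). The server parses the flag with an existing `parse_size` helper; the standalone sketch below only illustrates how such strings map to bytes and is not the project's actual parser:

import re

_UNITS = {
    "B": 1,
    "KB": 10**3, "MB": 10**6, "GB": 10**9, "TB": 10**12,      # decimal (SI) units
    "KIB": 2**10, "MIB": 2**20, "GIB": 2**30, "TIB": 2**40,   # binary units
}


def parse_size_to_bytes(value: str) -> int:
    """Convert strings like '50GB', '100GiB', or '1572864000' into a number of bytes."""
    match = re.fullmatch(r"\s*([\d.]+)\s*([A-Za-z]*)\s*", value)
    if match is None:
        raise ValueError(f"Unrecognized size: {value!r}")
    number, unit = match.groups()
    return int(float(number) * _UNITS[unit.upper() or "B"])


assert parse_size_to_bytes("50GB") == 50 * 10**9     # not the same as 50 GiB
assert parse_size_to_bytes("100GiB") == 100 * 2**30
assert parse_size_to_bytes("1572864000") == 1572864000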
+ 8 - 0
src/petals/server/handler.py

@@ -54,6 +54,14 @@ class TransformerConnectionHandler(ConnectionHandler):
         self.session_timeout, self.step_timeout = session_timeout, step_timeout
         self._prioritizer = task_prioritizer

+    def shutdown(self):
+        if self.is_alive():
+            self._outer_pipe.send("_shutdown")
+            self.join(self.shutdown_timeout)
+            if self.is_alive():
+                logger.warning(f"{self.__class__.__name__} failed to shut down gracefully, sending SIGTERM")
+                self.terminate()
+
     async def _gather_inputs(
         self, requests: AsyncIterator[runtime_pb2.ExpertRequest], context: P2PContext
     ) -> Tuple[str, List[torch.Tensor], Dict]:

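The new `shutdown()` follows a common pattern for process-based handlers: ask the child process to exit over a control pipe, wait up to a timeout, then fall back to SIGTERM. A generic, self-contained sketch of the same pattern (the pipe attribute, timeout, and worker loop here are illustrative, not the handler's real internals):

import multiprocessing as mp
import time


class GracefulWorker(mp.Process):
    """Toy worker that exits when it receives a "_shutdown" message on its control pipe."""

    def __init__(self, shutdown_timeout: float = 5.0):
        super().__init__(daemon=True)
        self.shutdown_timeout = shutdown_timeout
        self._outer_pipe, self._inner_pipe = mp.Pipe()

    def run(self):
        while True:
            # Check for a shutdown request, then do a slice of (placeholder) work
            if self._inner_pipe.poll(timeout=0.1) and self._inner_pipe.recv() == "_shutdown":
                return
            time.sleep(0.1)

    def shutdown(self):
        if self.is_alive():
            self._outer_pipe.send("_shutdown")   # ask politely first
            self.join(self.shutdown_timeout)
            if self.is_alive():
                print(f"{self.__class__.__name__} did not exit in time, sending SIGTERM")
                self.terminate()                 # escalate to SIGTERM


if __name__ == "__main__":
    worker = GracefulWorker()
    worker.start()
    worker.shutdown()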
+ 13 - 3
src/petals/server/server.py

@@ -29,6 +29,7 @@ from petals.server.handler import TransformerConnectionHandler
 from petals.server.memory_cache import MemoryCache
 from petals.server.throughput import get_host_throughput
 from petals.utils.convert_8bit import replace_8bit_linear
+from petals.utils.disk_cache import DEFAULT_CACHE_DIR

 use_hivemind_log_handler("in_root_logger")
 logger = get_logger(__file__)
@@ -56,6 +57,7 @@ class Server:
         torch_dtype: str = "auto",
         torch_dtype: str = "auto",
         revision: str = "main",
         revision: str = "main",
         cache_dir: Optional[str] = None,
         cache_dir: Optional[str] = None,
+        max_disk_space: Optional[int] = None,
         attn_cache_size: Optional[int] = None,
         attn_cache_size: Optional[int] = None,
         alloc_timeout: float = 60,
         alloc_timeout: float = 60,
         device: Optional[Union[str, torch.device]] = None,
         device: Optional[Union[str, torch.device]] = None,
@@ -82,7 +84,6 @@ class Server:
         self.num_handlers = num_handlers
         self.min_batch_size, self.max_batch_size = min_batch_size, max_batch_size
         self.inference_max_length = inference_max_length
-        self.cache_dir = cache_dir
         self.compression = compression
         self.stats_report_interval, self.update_period = stats_report_interval, update_period
         self.prefetch_batches, self.sender_threads = prefetch_batches, sender_threads
@@ -117,7 +118,8 @@ class Server:
         self.dht = DHT(initial_peers=initial_peers, start=True, num_workers=self.block_config.n_layer, **kwargs)
         visible_maddrs_str = [str(a) for a in self.dht.get_visible_maddrs()]
         if initial_peers == PUBLIC_INITIAL_PEERS:
-            logger.info("Connecting to the public Petals swarm")
+            logger.info(f"Connecting to the public swarm, peer_id = {self.dht.peer_id}")
+            logger.info("Please check that your server is reachable at http://health.petals.ml")
         else:
             logger.info(f"Running DHT node on {visible_maddrs_str}, initial peers = {initial_peers}")

@@ -158,6 +160,11 @@ class Server:
         logger.info(f"Attention cache for all blocks will consume up to {attn_cache_size / gib:.2f} GiB")
         logger.info(f"Attention cache for all blocks will consume up to {attn_cache_size / gib:.2f} GiB")
         self.memory_cache = MemoryCache(device, attn_cache_size, alloc_timeout)
         self.memory_cache = MemoryCache(device, attn_cache_size, alloc_timeout)
 
 
+        if cache_dir is None:
+            cache_dir = DEFAULT_CACHE_DIR
+        self.cache_dir = cache_dir
+        self.max_disk_space = max_disk_space
+
         assert isinstance(throughput, float) or throughput in ["auto", "eval"]
         assert isinstance(throughput, float) or throughput in ["auto", "eval"]
         if throughput in ["auto", "eval"]:
         if throughput in ["auto", "eval"]:
             throughput = get_host_throughput(
             throughput = get_host_throughput(
@@ -213,6 +220,7 @@ class Server:
                 inference_max_length=self.inference_max_length,
                 torch_dtype=self.torch_dtype,
                 cache_dir=self.cache_dir,
+                max_disk_space=self.max_disk_space,
                 device=self.device,
                 compression=self.compression,
                 stats_report_interval=self.stats_report_interval,
@@ -308,7 +316,8 @@ class ModuleContainer(threading.Thread):
         min_batch_size: int,
         max_batch_size: int,
         torch_dtype: torch.dtype,
-        cache_dir: Optional[str],
+        cache_dir: str,
+        max_disk_space: int,
         device: Union[str, torch.device],
         compression: CompressionType,
         update_period: float,
@@ -340,6 +349,7 @@ class ModuleContainer(threading.Thread):
                     torch_dtype=torch_dtype,
                     use_auth_token=use_auth_token,
                     cache_dir=cache_dir,
+                    max_disk_space=max_disk_space,
                 )

                 if load_in_8bit:

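With these changes, `max_disk_space` travels from the CLI flag through `Server` and `ModuleContainer` down to `load_pretrained_block`. A hedged usage sketch of the updated call (the repo name, block index, and disk budget below are illustrative, and the keyword names follow the signatures shown in this diff):

import torch

from petals.bloom.from_pretrained import load_pretrained_block
from petals.utils.disk_cache import DEFAULT_CACHE_DIR

# Load one converted BLOOM block, allowing the block cache to grow to at most 100 GiB.
block = load_pretrained_block(
    "bigscience/bloom-petals",      # converted model repo (illustrative)
    block_index=3,
    torch_dtype=torch.bfloat16,
    cache_dir=DEFAULT_CACHE_DIR,
    max_disk_space=100 * 1024**3,   # bytes
)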
+ 82 - 0
src/petals/utils/disk_cache.py

@@ -1,4 +1,86 @@
+import fcntl
 import os
+import shutil
+from contextlib import contextmanager
 from pathlib import Path
+from typing import Optional
+
+import huggingface_hub
+from hivemind.utils.logging import get_logger
+
+logger = get_logger(__file__)

 DEFAULT_CACHE_DIR = os.getenv("PETALS_CACHE", Path(Path.home(), ".cache", "petals"))
+
+BLOCKS_LOCK_FILE = "blocks.lock"
+
+
+@contextmanager
+def _blocks_lock(cache_dir: Optional[str], mode: int):
+    if cache_dir is None:
+        cache_dir = DEFAULT_CACHE_DIR
+    lock_path = Path(cache_dir, BLOCKS_LOCK_FILE)
+
+    os.makedirs(lock_path.parent, exist_ok=True)
+    with open(lock_path, "wb") as lock_fd:
+        fcntl.flock(lock_fd.fileno(), mode)
+        # The OS will release the lock when lock_fd is closed or the process is killed
+        yield
+
+
+def allow_cache_reads(cache_dir: Optional[str]):
+    """Allows simultaneous reads, guarantees that blocks won't be removed along the way (shared lock)"""
+    return _blocks_lock(cache_dir, fcntl.LOCK_SH)
+
+
+def allow_cache_writes(
+    cache_dir: Optional[str], *, reserve: Optional[int] = None, max_disk_space: Optional[int] = None
+):
+    """Allows saving new blocks and removing the old ones (exclusive lock)"""
+    return _blocks_lock(cache_dir, fcntl.LOCK_EX)
+
+
+def free_disk_space_for(
+    model_name: str,
+    size: int,
+    *,
+    cache_dir: Optional[str],
+    max_disk_space: Optional[int],
+    os_quota: int = 1024**3,  # Minimal space we should leave to keep OS function normally
+):
+    if cache_dir is None:
+        cache_dir = DEFAULT_CACHE_DIR
+    cache_info = huggingface_hub.scan_cache_dir(cache_dir)
+    model_repos = [repo for repo in cache_info.repos if repo.repo_type == "model" and repo.repo_id == model_name]
+
+    occupied_space = sum(repo.size_on_disk for repo in model_repos)
+    available_space = shutil.disk_usage(cache_dir).free - os_quota
+    if max_disk_space is not None:
+        available_space = min(available_space, max_disk_space - occupied_space)
+    if size <= available_space:
+        return
+
+    revisions = [revision for repo in model_repos for revision in repo.revisions]
+    revisions.sort(key=lambda rev: max([item.blob_last_accessed for item in rev.files], default=rev.last_modified))
+
+    # Remove as few least recently used blocks as possible
+    pending_removal = []
+    freed_space = 0
+    extra_space_needed = size - available_space
+    for rev in revisions:
+        pending_removal.append(rev.commit_hash)
+        freed_space += rev.size_on_disk
+        if freed_space >= extra_space_needed:
+            break
+
+    if pending_removal:
+        gib = 1024**3
+        logger.info(f"Removing {len(pending_removal)} blocks to free {freed_space / gib:.1f} GiB of disk space")
+        delete_strategy = cache_info.delete_revisions(*pending_removal)
+        delete_strategy.execute()
+
+    if freed_space < extra_space_needed:
+        raise RuntimeError(
+            f"Insufficient disk space to load a block. Please free {extra_space_needed - freed_space:.1f} GiB "
+            f"on the volume for {cache_dir} or increase --max_disk_space if you set it manually"
+        )
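
A minimal usage sketch of the new disk-cache helpers (paths, sizes, and the repo name are illustrative): readers hold the shared lock so cached blocks cannot be evicted under them, while a writer takes the exclusive lock and frees least-recently-used revisions before downloading. Note that when `--max_disk_space` is set, the space available for a new block is effectively min(free_disk_space - os_quota, max_disk_space - space_already_used_by_the_model).

from petals.utils.disk_cache import (
    DEFAULT_CACHE_DIR,
    allow_cache_reads,
    allow_cache_writes,
    free_disk_space_for,
)

cache_dir = DEFAULT_CACHE_DIR
block_size = 5 * 1024**3  # assume one block takes ~5 GiB on disk (illustrative)

# Several processes may read cached blocks at the same time (shared lock).
with allow_cache_reads(cache_dir):
    pass  # e.g., torch.load() a cached block here

# Only one process at a time may download or evict blocks (exclusive lock).
with allow_cache_writes(cache_dir):
    free_disk_space_for(
        "bigscience/bloom-petals", block_size, cache_dir=cache_dir, max_disk_space=300 * 1024**3
    )
    # ...download the new block here...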