
Add adapters loading

artek0chumak 2 years ago
parent
commit
30e3f4a6b4

+ 5 - 1
.github/workflows/run-tests.yaml

@@ -33,6 +33,7 @@ jobs:
        run: |
          export MODEL_NAME=bigscience/bloom-560m
          export REF_NAME=bigscience/bloom-560m
+          export ADAPTER_NAME=artek0chumak/bloom-560m-safe-peft

          python -m petals.cli.run_server --converted_model_name_or_path $MODEL_NAME --block_indices 0:12 \
            --new_swarm --identity tests/test.id --host_maddrs /ip4/127.0.0.1/tcp/31337 --throughput 1 \
@@ -58,11 +59,14 @@ jobs:
            --initial_peers $INITIAL_PEERS --throughput 1 --torch_dtype float32 --tensor_parallel_devices cpu cpu &> server4.log &
          SERVER4_PID=$!

+          python -m petals.cli.run_server --converted_model_name_or_path $MODEL_NAME --block_indices 0:24             --new_swarm --identity tests/test.id --host_maddrs /ip4/127.0.0.1/tcp/31337 --throughput 1             --torch_dtype float32 --compression NONE --attn_cache_tokens 2048 --adapters $ADAPTER_NAME &> server5.log &
+          SERVER5_PID=$!
+
          tail -n 100 -f server*.log &
          LOGGER_PID=$!
          sleep 30  # wait for servers to download layers

-          kill -0 $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID # ensure all servers survived init
+          kill -0 $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID $SERVER5_PID # ensure all servers survived init

          pytest tests --durations=0 --durations-min=1.0 -v


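For reference, the fifth server above can be reproduced outside CI with the same flags. A minimal Python sketch (not part of this commit) that rebuilds the command from the environment variables exported in the workflow:

import os
import subprocess

# Values exported by the workflow step above; the fallbacks simply repeat them for local runs.
model_name = os.environ.get("MODEL_NAME", "bigscience/bloom-560m")
adapter_name = os.environ.get("ADAPTER_NAME", "artek0chumak/bloom-560m-safe-peft")

# Same arguments as the server5 line in the workflow, expressed as a list for subprocess.
cmd = [
    "python", "-m", "petals.cli.run_server",
    "--converted_model_name_or_path", model_name,
    "--block_indices", "0:24",
    "--new_swarm", "--identity", "tests/test.id",
    "--host_maddrs", "/ip4/127.0.0.1/tcp/31337",
    "--throughput", "1",
    "--torch_dtype", "float32",
    "--compression", "NONE",
    "--attn_cache_tokens", "2048",
    "--adapters", adapter_name,
]
server5 = subprocess.Popen(cmd)  # the workflow additionally redirects output to server5.log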
+ 2 - 0
src/petals/cli/run_server.py

@@ -146,6 +146,8 @@ def main():
                         help="Skip checking this server's reachability via health.petals.ml "
                         help="Skip checking this server's reachability via health.petals.ml "
                              "when connecting to the public swarm. If you connect to a private swarm, "
                              "when connecting to the public swarm. If you connect to a private swarm, "
                              "the check is skipped by default. Use this option only if you know what you are doing")
                              "the check is skipped by default. Use this option only if you know what you are doing")
+    
+    parser.add_argument("--adapters", nargs='+', default=None, help="List of pretrained LoRA adapters that can be used for inference or training.")
 
 
     # fmt:on
     # fmt:on
     args = vars(parser.parse_args())
     args = vars(parser.parse_args())

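Because the new flag is declared with nargs='+', run_server.py accepts one or more adapter repo IDs and hands them on as a plain Python list (or None when the flag is omitted). A quick standalone sketch of that argparse behaviour, using a throwaway parser and made-up repo names rather than the real CLI:

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--adapters", nargs="+", default=None,
                    help="List of pretrained LoRA adapters that can be used for inference or training.")

# One or more values after --adapters arrive as a list; omitting the flag yields None.
print(parser.parse_args(["--adapters", "user/lora-a", "user/lora-b"]).adapters)  # ['user/lora-a', 'user/lora-b']
print(parser.parse_args([]).adapters)                                            # None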
+ 11 - 1
src/petals/server/server.py

@@ -81,6 +81,7 @@ class Server:
         dht_client_mode: Optional[bool] = None,
         use_relay: bool = True,
         use_auto_relay: bool = True,
+        adapters: Optional[List[str]] = None,
         **kwargs,
     ):
         """Create a server with one or more bloom blocks. See run_server.py for documentation."""
@@ -217,6 +218,8 @@ class Server:
         self.balance_quality = balance_quality
         self.mean_balance_check_period = mean_balance_check_period
         self.mean_block_selection_delay = mean_block_selection_delay
+        
+        self.adapters = adapters

         self.stop = threading.Event()

@@ -291,6 +294,7 @@ class Server:
                 quant_type=self.quant_type,
                 tensor_parallel_devices=self.tensor_parallel_devices,
                 should_validate_reachability=self.should_validate_reachability,
+                adapters=self.adapters,
                 start=True,
             )
             try:
@@ -384,6 +388,7 @@ class ModuleContainer(threading.Thread):
         quant_type: QuantType,
         tensor_parallel_devices: Sequence[torch.device],
         should_validate_reachability: bool,
+        adapters: Optional[List[str]] = None,
         **kwargs,
     ) -> ModuleContainer:
         module_uids = [f"{dht_prefix}{UID_DELIMITER}{block_index}" for block_index in block_indices]
@@ -415,7 +420,12 @@ class ModuleContainer(threading.Thread):
                     cache_dir=cache_dir,
                     max_disk_space=max_disk_space,
                 )
-                block = convert_block(block, block_config, tensor_parallel_devices, device, quant_type, freeze=True)
+                block = convert_block(
+                    block, block_index, block_config, tensor_parallel_devices, device, quant_type, adapters=adapters, freeze=True,
+                    use_auth_token=use_auth_token,
+                    cache_dir=cache_dir,
+                    max_disk_space=max_disk_space,
+                )
                 blocks[module_uid] = TransformerBackend(
                     module_uid,
                     block,

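The server-side changes are pure plumbing: the list from --adapters is stored on the Server, forwarded to ModuleContainer.create, and passed into convert_block for every block the server hosts. A condensed, runnable sketch of that flow with toy stand-ins (DemoServer, DemoContainer and demo_convert_block are illustrative, not the real petals classes):

from typing import List, Optional


def demo_convert_block(block_index: int, adapters: Optional[List[str]] = None) -> None:
    # In the real convert_block this is where per-block LoRA weights get attached.
    print(f"block {block_index}: adapters={adapters}")


class DemoContainer:
    @classmethod
    def create(cls, *, block_indices: range, adapters: Optional[List[str]] = None) -> "DemoContainer":
        for block_index in block_indices:
            demo_convert_block(block_index, adapters=adapters)
        return cls()


class DemoServer:
    def __init__(self, adapters: Optional[List[str]] = None):
        self.adapters = adapters  # stored exactly like the new self.adapters above

    def run(self) -> None:
        DemoContainer.create(block_indices=range(2), adapters=self.adapters)


DemoServer(adapters=["artek0chumak/bloom-560m-safe-peft"]).run()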
+ 12 - 6
src/petals/utils/convert_block.py

@@ -4,13 +4,13 @@ Tools for converting transformer blocks, applying quantization and/or tensor par
 import os
 import re
 from enum import Enum
-from typing import Sequence
+from typing import List, Optional, Sequence

 import tensor_parallel as tp
 import torch
 import torch.nn as nn
 from hivemind.utils.logging import get_logger, use_hivemind_log_handler
-from peft import create_lora_adapter, add_adapter_to_block, load_peft
+from petals.utils.peft import create_lora_adapter, add_adapter_to_block, load_peft
 from tensor_parallel.slicing_configs import get_bloom_config
 from transformers import PretrainedConfig

@@ -26,12 +26,14 @@ class QuantType(Enum):

 def convert_block(
     block: nn.Module,
+    block_index: int,
     config: PretrainedConfig,
     tensor_parallel_devices: Sequence[torch.device],
     output_device: torch.device,
     quant_type: QuantType,
     freeze: bool = True,
     adapters: Optional[List[str]] = None,
+    **kwargs,
 ) -> tp.TensorParallel:
     """
     Optimize a transformer block for use in a Petals server, apply tensor parallelism and/or LLM.8bit quantization
@@ -57,12 +59,16 @@ def convert_block(

     for shard, device in zip(block.module_shards, block.devices):
         shard.to(device)
-        
+
     if adapters:
         create_lora_adapter(block)
-        for adapter in adapters:
-            adapter_config, adapter_state_dict = load_peft(adapter)
-            add_adapter_to_block(block, adapter_config, adapter_state_dict)
+        for adapter_name in adapters:
+            adapter_config, adapter_state_dict = load_peft(
+                adapter_name,
+                block_idx=block_index,
+                **kwargs,
+            )
+            add_adapter_to_block(block, block_index, adapter_name, adapter_config, adapter_state_dict)

     return block


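The new convert_block signature takes the block index plus extra keyword arguments (use_auth_token, cache_dir, max_disk_space) and forwards them straight into load_peft, so each block loads only its own adapter tensors from every adapter repo. A toy sketch of that loop with stub functions (the fake_* names are illustrative; the real helpers live in petals.utils.peft):

from typing import Dict, List, Optional, Tuple


def fake_create_lora_adapter(block: object) -> None:
    print("wrapping the block's linear layers with empty LoRA shells")


def fake_load_peft(adapter_name: str, *, block_idx: int, **kwargs) -> Tuple[Dict, Dict]:
    # kwargs carries use_auth_token / cache_dir / max_disk_space through unchanged.
    print(f"fetching {adapter_name} weights for block {block_idx}, kwargs={kwargs}")
    return {"peft_type": "LORA"}, {}


def fake_add_adapter_to_block(block, block_index, adapter_name, config, state_dict) -> None:
    print(f"attaching {adapter_name} to block {block_index}")


def demo_convert_block(block, block_index: int, adapters: Optional[List[str]] = None, **kwargs):
    if adapters:
        fake_create_lora_adapter(block)
        for adapter_name in adapters:
            adapter_config, adapter_state_dict = fake_load_peft(adapter_name, block_idx=block_index, **kwargs)
            fake_add_adapter_to_block(block, block_index, adapter_name, adapter_config, adapter_state_dict)
    return block


demo_convert_block(object(), 5, adapters=["artek0chumak/bloom-560m-safe-peft"], cache_dir="/tmp/petals_cache")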
+ 58 - 24
src/petals/utils/peft.py

@@ -1,9 +1,14 @@
+import re
 import time
 from typing import List, Optional

+import torch.nn as nn
+import bitsandbytes as bnb
+
 from hivemind.utils.logging import get_logger
 from huggingface_hub import HfFileSystem, get_hf_file_metadata, hf_hub_url
-from peft.utils import CONFIG_NAME, SAFETENSORS_WEIGHTS_NAME, PeftConfig
+from peft.tuners import lora
+from peft.utils import COMMON_LAYERS_PATTERN, CONFIG_NAME, SAFETENSORS_WEIGHTS_NAME, PeftConfig
 from safetensors import safe_open
 from safetensors.torch import load_file
 from transformers.utils import get_file_from_repo
@@ -19,23 +24,22 @@ def check_peft_repository(repo_id: str) -> bool:
     return len(list_of_files) > 0


-def load_specific_module(layers_name: List[str], filepath: str, framework: str = "pt", device: Optional[int] = None):
+def load_specific_module(block_idx: int, filepath: str, framework: str = "pt", device: Optional[int] = None):
     tensors = dict()
     is_tensors_found = dict()
+    common_layer_patter_re = ".+\." + "".join(f"({common_name})?" for common_name in COMMON_LAYERS_PATTERN) + f"({block_idx})?\.0\..+"
     with safe_open(filepath, framework=framework, device=device) as f:
         for k in f.keys():
-            for layer_name in layers_name:
-                if k.startswith(layer_name):
-                    is_tensors_found[layer_name] = True
-                    tensors[k] = f.get_tensor(k)
-        for layer_name in layers_name:
-            if not is_tensors_found.get(layer_name, False):
-                logger.warning(f"There is no peft weights with prefix {layer_name}")
+            if re.match(common_layer_patter_re, k):
+                is_tensors_found[block_idx] = True
+                tensors[k] = f.get_tensor(k)
+        if not is_tensors_found.get(block_idx, False):
+            logger.warning(f"There is no peft weights for block {block_idx}")
         return tensors


 def get_adapter_from_repo(
-    repo_id: str, layers_name: Optional[List[str]] = None, device: Optional[int] = None, **kwargs
+    repo_id: str, block_idx: Optional[int] = None, device: Optional[int] = None, **kwargs
 ):
     config_path = get_file_from_repo(repo_id, CONFIG_NAME, **kwargs)
     if config_path is None:
@@ -45,14 +49,14 @@ def get_adapter_from_repo(
     weight_path = get_file_from_repo(repo_id, SAFETENSORS_WEIGHTS_NAME, **kwargs)
     if weight_path is None:
         raise RuntimeError(f"File {SAFETENSORS_WEIGHTS_NAME} does not exist in repo {repo_id}")
-    if layers_name is None:
+    if block_idx is None:
         return config, load_file(weight_path)
-    return config, load_specific_module(layers_name, weight_path, device=device)
+    return config, load_specific_module(block_idx, weight_path, device=device)


 def load_peft(
     repo_id: str,
-    layers_name: Optional[List[str]] = None,
+    block_idx: Optional[int] = None,
     device: Optional[int] = None,
     *,
     revision: Optional[str] = None,
@@ -70,7 +74,7 @@ def load_peft(
         with allow_cache_reads(cache_dir):
             return get_adapter_from_repo(
                 repo_id,
-                layers_name,
+                block_idx,
                 device,
                 revision=revision,
                 use_auth_token=use_auth_token,
@@ -96,7 +100,7 @@ def load_peft(

                 return get_adapter_from_repo(
                     repo_id,
-                    layers_name,
+                    block_idx,
                     device,
                     revision=revision,
                     use_auth_token=use_auth_token,
@@ -115,8 +119,8 @@ def create_lora_adapter(block):
         for child_name, child in module.named_children():
             lora_wrapped_child = None
             if isinstance(child, nn.Linear):
-                bias = hasattr(target, "bias") and target.bias is not None
-                lora_wrapped_child = peft.tuners.lora.Linear(
+                bias = hasattr(child, "bias") and child.bias is not None
+                lora_wrapped_child = lora.Linear(
                     child_name,
                     child.in_features,
                     child.out_features,
@@ -128,9 +132,9 @@ def create_lora_adapter(block):
                     "memory_efficient_backward": child.state.memory_efficient_backward,
                     "memory_efficient_backward": child.state.memory_efficient_backward,
                     "threshold": child.state.threshold,
                     "threshold": child.state.threshold,
                     "index": child.index,
                     "index": child.index,
-                    "bias": hasattr(target, "bias") and target.bias is not None,
+                    "bias": hasattr(child, "bias") and child.bias is not None,
                 }
-                lora_wrapped_child = peft.tuners.lora.Linear8bitLt(
+                lora_wrapped_child = lora.Linear8bitLt(
                     child_name,
                     child.in_features,
                     child.out_features,
@@ -141,9 +145,9 @@ def create_lora_adapter(block):
                     "compute_dtype": child.compute_dtype,
                     "compute_dtype": child.compute_dtype,
                     "compress_statistics": child.weight.compress_statistics,
                     "compress_statistics": child.weight.compress_statistics,
                     "quant_type": child.weight.quant_type,
                     "quant_type": child.weight.quant_type,
-                    "bias": hasattr(target, "bias") and target.bias is not None,
+                    "bias": hasattr(child, "bias") and child.bias is not None,
                 }
-                lora_wrapped_child = peft.tuners.lora.Linear4bit(
+                lora_wrapped_child = lora.Linear4bit(
                     child_name,
                     child.in_features,
                     child.out_features,
@@ -151,9 +155,39 @@ def create_lora_adapter(block):
                 )
             if lora_wrapped_child:
                 lora_wrapped_child.active_adapter = None
+                for p in lora_wrapped_child.parameters():
+                    p.requires_grad = False
                 setattr(module, child_name, lora_wrapped_child)


-def add_adapter_to_block(block, peft_config, peft_state_dict):
-    assert peft_config.peft_type == peft.PeftType.LORA, "Petals works only with LORA adapters"
-    pass
+def add_adapter_to_block(block, block_index, adapter_name, peft_config, peft_state_dict):
+    assert peft_config["peft_type"] == "LORA", "Petals works only with LORA adapters"
+    for name, module in block.named_modules():
+        for child_name, child in module.named_children():
+            if not isinstance(child, (lora.Linear, lora.Linear8bitLt, lora.Linear4bit)):
+                continue
+
+            if child_name in peft_config["target_modules"] or (isinstance(peft_config["target_modules"], str) and re.fullmatch(peft_config["target_modules"], child_name)):
+                is_lora_a_loaded = False
+                is_lora_b_loaded = False
+                for peft_key in peft_state_dict:
+                    if adapter_name not in child.lora_A:
+                        child.update_layer(
+                            adapter_name,
+                            peft_config["r"],
+                            peft_config["lora_alpha"],
+                            peft_config["lora_dropout"],
+                            peft_config["init_lora_weights"],
+                        )
+                        for p in child.parameters():
+                            p.requires_grad = False
+
+                    if "lora_A" in peft_key:
+                        child.lora_A[adapter_name].weight.data = peft_state_dict[peft_key] * child.scaling[adapter_name]
+                        is_lora_a_loaded = True
+                    elif "lora_B" in peft_key:
+                        child.lora_B[adapter_name].weight.data = peft_state_dict[peft_key]
+                        is_lora_b_loaded = True
+                        
+                if is_lora_a_loaded and is_lora_b_loaded:
+                    logger.info(f"Loading {adapter_name} for block {block_index} is ended successfully")