1 anno fa · 554779f654
--- a/src/petals/server/from_pretrained.py
+++ b/src/petals/server/from_pretrained.py
@@ -21,6 +21,7 @@ from huggingface_hub import get_hf_file_metadata, hf_hub_url
 
				 from huggingface_hub.utils import EntryNotFoundError
			
 
				 from transformers import PretrainedConfig, PreTrainedModel
			
 
				 from transformers.utils import get_file_from_repo
			
 
				+from transformers.quantizers import AutoHfQuantizer
			
 
				 
			
 
				 from petals.constants import DTYPE_MAP
			
 
				 from petals.models.mixtral import WrappedMixtralBlock
			
@@ -28,6 +29,7 @@ from petals.server.block_utils import get_model_block, resolve_block_dtype
 
				 from petals.utils.auto_config import AutoDistributedConfig
			
 
				 from petals.utils.disk_cache import DEFAULT_CACHE_DIR, allow_cache_reads, allow_cache_writes, free_disk_space_for
			
 
				 from petals.utils.hf_auth import always_needs_auth
			
 
				+from petals.utils.convert_block import is_gptq_quant
			
 
				 
			
 
				 logger = get_logger(__name__)
			
 
				 
			
@@ -55,6 +57,18 @@ def load_pretrained_block(
 
				         block = get_model_block(config, layer_idx=block_index)
			
 
				 
			
 
				     block_prefix = f"{config.block_prefix}.{block_index}."
			
 
				+
			
 
				+    if is_gptq_quant(config):
			
 
				+        hf_quantizer = AutoHfQuantizer.from_config(config.quantization_config, pre_quantized=True)
			
 
				+        hf_quantizer.optimum_quantizer.block_name_to_quantize = str(block_index)
			
 
				+        tmp_block_list = torch.nn.ModuleList([block])
			
 
				+        tmp_block_list.__class__.main_input_name = "input_ids"
			
 
				+        torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype)
			
 
				+        device_map = hf_quantizer.update_device_map("cuda")
			
 
				+        hf_quantizer.preprocess_model(
			
 
				+            model=tmp_block_list, device_map=device_map, keep_in_fp32_modules=False,
			
 
				+        )
			
 
				+
			
 
				     state_dict = _load_state_dict_from_repo(
			
 
				         model_name,
			
 
				         block_prefix,
			
--- a/src/petals/server/server.py
+++ b/src/petals/server/server.py
@@ -33,7 +33,7 @@ from petals.server.memory_cache import MemoryCache
 
				 from petals.server.reachability import ReachabilityProtocol, check_direct_reachability, validate_reachability
			
 
				 from petals.server.throughput import get_dtype_name, get_server_throughput
			
 
				 from petals.utils.auto_config import AutoDistributedConfig
			
 
				-from petals.utils.convert_block import QuantType, check_device_balance, convert_block
			
 
				+from petals.utils.convert_block import QuantType, check_device_balance, convert_block, is_gptq_quant
			
 
				 from petals.utils.dht import declare_active_modules, get_remote_module_infos
			
 
				 from petals.utils.misc import get_size_in_bytes
			
 
				 from petals.utils.ping import PingAggregator
			
@@ -428,6 +428,8 @@ class Server:
 
				         self.dht.join()
			
 
				 
			
 
				 
			
 
				+
			
 
				+
			
 
				 class ModuleContainer(threading.Thread):
			
 
				     """Serves a set of specific Bloom layers for inference, forward, and backward. Announces itself over the DHT."""
			
 
				 
			
@@ -495,19 +497,20 @@ class ModuleContainer(threading.Thread):
 
				                     cache_dir=cache_dir,
			
 
				                     max_disk_space=max_disk_space,
			
 
				                 )
			
 
				-                block = convert_block(
			
 
				-                    block,
			
 
				-                    block_index,
			
 
				-                    block_config,
			
 
				-                    tensor_parallel_devices,
			
 
				-                    device,
			
 
				-                    quant_type,
			
 
				-                    adapters=server_info.adapters,
			
 
				-                    freeze=True,
			
 
				-                    token=token,
			
 
				-                    cache_dir=cache_dir,
			
 
				-                    max_disk_space=max_disk_space,
			
 
				-                )
			
 
				+                if not is_gptq_quant(block_config):
			
 
				+                    block = convert_block(
			
 
				+                        block,
			
 
				+                        block_index,
			
 
				+                        block_config,
			
 
				+                        tensor_parallel_devices,
			
 
				+                        device,
			
 
				+                        quant_type,
			
 
				+                        adapters=server_info.adapters,
			
 
				+                        freeze=True,
			
 
				+                        token=token,
			
 
				+                        cache_dir=cache_dir,
			
 
				+                        max_disk_space=max_disk_space,
			
 
				+                    )
			
 
				                 blocks[module_uid] = TransformerBackend(
			
 
				                     module_uid,
			
 
				                     block,
			
@@ -554,6 +557,7 @@ class ModuleContainer(threading.Thread):
 
				             **kwargs,
			
 
				         )
			
 
				 
			
 
				+
			
 
				     def __init__(
			
 
				         self,
			
 
				         dht: DHT,
			
--- a/src/petals/utils/convert_block.py
+++ b/src/petals/utils/convert_block.py
@@ -21,6 +21,9 @@ class QuantType(Enum):
 
				     INT8 = 1  # 8-bit as in the LLM.int8() paper
			
 
				     NF4 = 2  # 4-bit as in the QLoRA paper
			
 
				 
			
 
				def is_gptq_quant(config) -> bool:
				    """Return True if ``config`` declares GPTQ as its quantization method.

				    The check reads ``config.quantization_config`` and compares its
				    ``quant_method`` entry with the string ``"gptq"``.

				    Args:
				        config: Any object; typically a model config. It does not need to
				            have a ``quantization_config`` attribute.

				    Returns:
				        True when a quantization config is present and its ``quant_method``
				        equals ``"gptq"``; False otherwise (including when the attribute is
				        missing or set to None).
				    """
				    quantization_config = getattr(config, "quantization_config", None)
				    if quantization_config is None:
				        return False

				    # The quantization config may be either a plain mapping (e.g. as parsed
				    # from JSON) or an object exposing `quant_method` as an attribute.
				    # `hasattr` alone would never see the key of a dict, so handle both.
				    if isinstance(quantization_config, dict):
				        quant_method = quantization_config.get("quant_method")
				    else:
				        quant_method = getattr(quantization_config, "quant_method", None)

				    return quant_method == "gptq"
			
 
				 
			
 
				 def convert_block(
			
 
				     block: nn.Module,