```diff
@@ -54,13 +54,14 @@ def convert_block(

     block = make_tensor_parallel(block, config, tensor_parallel_devices, output_device=output_device)

-    if quant_type != QuantType.NONE:
+    if quant_type != QuantType.NONE and not is_gptq_quant(config):
+        print("I'm still quantizing ")
         block = quantize_module(block, quant_type=quant_type)

     for shard, device in zip(block.module_shards, block.devices):
         shard.to(device)

-    if adapters:
+    if adapters and not is_gptq_quant(config):
         from petals.utils.peft import add_adapter_to_block, create_lora_adapter, load_peft

         create_lora_adapter(block)
```
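
The new conditions call an `is_gptq_quant(config)` helper that is not defined in this hunk. A minimal sketch of what such a helper might look like, assuming it inspects the `quantization_config` that Hugging Face attaches to a model config for pre-quantized (GPTQ) checkpoints; the function name, signature, and logic below are assumptions for illustration, not the PR's actual implementation:

```python
from transformers import PretrainedConfig


def is_gptq_quant(config: PretrainedConfig) -> bool:
    """Guess whether the checkpoint is already GPTQ-quantized (assumed helper, not from this PR).

    Hugging Face configs for pre-quantized models usually carry a `quantization_config`
    whose `quant_method` is "gptq"; if so, re-quantizing or attaching LoRA adapters
    on top would be skipped by the checks added in this hunk.
    """
    quant_cfg = getattr(config, "quantization_config", None)
    if quant_cfg is None:
        return False
    # quantization_config may be stored as a plain dict or as a config object
    if isinstance(quant_cfg, dict):
        quant_method = quant_cfg.get("quant_method")
    else:
        quant_method = getattr(quant_cfg, "quant_method", None)
    return quant_method == "gptq"
```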