
Use 4-bit for llama by default, use bitsandbytes 0.40.0.post3 (#340)

NF4 inference with bitsandbytes 0.40.0.post3 is ~2x faster than int8 inference, though NF4 training is still ~3x slower; see:

- [bitsandbytes 0.40.0 Release notes](https://github.com/TimDettmers/bitsandbytes/releases/tag/0.40.0)
- [RPS benchmarks](https://github.com/bigscience-workshop/petals/pull/333#issuecomment-1614040385)

We've decided to use NF4 by default for LLaMA.
Alexander Borzunov committed 2 years ago (commit fa095f6461)
2 changed files with 5 additions and 2 deletions:

1. setup.cfg (+1, -1)
2. src/petals/server/server.py (+4, -1)

setup.cfg (+1, -1)

@@ -32,7 +32,7 @@ packages = find:
 python_requires = >=3.7
 install_requires =
     torch>=1.12
-    bitsandbytes==0.39.1
+    bitsandbytes==0.40.0.post3
     accelerate>=0.16.0,<1.0.0
     huggingface-hub>=0.11.1,<1.0.0
     tokenizers>=0.13.3

src/petals/server/server.py (+4, -1)

@@ -166,7 +166,10 @@ class Server:
             check_device_balance(self.tensor_parallel_devices)
 
         if quant_type is None:
-            quant_type = QuantType.INT8 if device.type == "cuda" else QuantType.NONE
+            if device.type == "cuda":
+                quant_type = QuantType.NF4 if self.block_config.model_type == "llama" else QuantType.INT8
+            else:
+                quant_type = QuantType.NONE
         self.quant_type = quant_type
         logger.info(f"Model weights are loaded in {get_dtype_name(torch_dtype, quant_type)} format")