2 ani în urmă · 84bcc8090c
--- a/setup.cfg
+++ b/setup.cfg
@@ -41,7 +41,6 @@ install_requires =
 
				     tensor_parallel==1.0.23
			
 
				     humanfriendly
			
 
				     async-timeout>=4.0.2
			
 
				-    cpufeature>=0.2.0
			
 
				     packaging>=20.9
			
 
				 
			
 
				 [options.extras_require]
			
--- a/src/petals/bloom/modeling_utils.py
+++ b/src/petals/bloom/modeling_utils.py
@@ -29,16 +29,6 @@ class LMHead(nn.Module):
 
				         self.word_embeddings = word_embeddings
			
 
				 
			
 
				         self.use_chunked_forward = config.use_chunked_forward
			
 
				-        if self.use_chunked_forward == "auto":
			
 
				-            if platform.machine() == "x86_64":
			
 
				-                # Import of cpufeature may crash on non-x86_64 machines
			
 
				-                from cpufeature import CPUFeature
			
 
				-
			
 
				-                # If the CPU supports AVX512, plain bfloat16 is ~10x faster than chunked_forward().
			
 
				-                # Otherwise, it's ~8x slower.
			
 
				-                self.use_chunked_forward = not (CPUFeature["AVX512f"] and CPUFeature["OS_AVX512"])
			
 
				-            else:
			
 
				-                self.use_chunked_forward = True
			
 
				         self.chunked_forward_step = config.chunked_forward_step
			
 
				         self._bf16_warning_shown = False
			
 
				 
			
--- a/src/petals/client/remote_model.py
+++ b/src/petals/client/remote_model.py
@@ -44,8 +44,7 @@ class DistributedBloomConfig(BloomConfig):
 
				     tuning_mode: Optional[str] = None  # One of the finetune options: [None, 'shallow_ptune', 'deep_ptune', 'adapters']
			
 
				 
			
 
				     # These settings matter for running the client with dtype bfloat16 on CPU.
			
 
				-    # If the CPU doesn't support AVX512, chunked_forward() significantly speeds up computations.
			
 
				-    use_chunked_forward: Union[str, bool] = "auto"
			
 
				+    use_chunked_forward: bool = True
			
 
				     chunked_forward_step: int = 16384