
Update transformers to 4.41.2 (#583)

* updated transformers lib to 4.41.2

* fix all version ranges

* fix _seen_tokens

* downgrade numpy

* seq_len fix
Anton Sinitsin 1 year ago
parent commit 68585864ae

+ 2 - 1
setup.cfg

@@ -37,7 +37,7 @@ install_requires =
     accelerate>=0.27.2
     huggingface-hub>=0.11.1,<1.0.0
     tokenizers>=0.13.3
-    transformers==4.38.2  # if you change this, please also change version assert in petals/__init__.py
+    transformers==4.41.2  # if you change this, please also change version assert in petals/__init__.py
     speedtest-cli==2.1.3
     pydantic>=1.10,<2.0  # 2.0 is incompatible with hivemind yet
     hivemind==1.1.10.post2
@@ -50,6 +50,7 @@ install_requires =
     peft==0.5.0
     safetensors>=0.3.1
     Dijkstar>=2.6.0
+    numpy<2
 
 [options.extras_require]
 dev =
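The setup.cfg change keeps an exact pin on transformers and adds an upper bound on numpy. A minimal sketch of checking a local environment against these pins (uses importlib.metadata and the packaging library that the version assert below already relies on; this check itself is not part of the repo):

from importlib.metadata import version
from packaging.version import Version

assert version("transformers") == "4.41.2", "transformers must match the exact pin in setup.cfg"
assert Version(version("numpy")) < Version("2"), "numpy 2.x is excluded by the new numpy<2 pin"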

+ 2 - 2
src/petals/__init__.py

@@ -22,8 +22,8 @@ __version__ = "2.3.0.dev2"
 
 if not os.getenv("PETALS_IGNORE_DEPENDENCY_VERSION"):
     assert (
-        version.parse("4.38.2") <= version.parse(transformers.__version__) < version.parse("4.39.0")
-    ), "Please install a proper transformers version: pip install transformers>=4.37.1,<4.39.0"
+        version.parse("4.41.2") <= version.parse(transformers.__version__) < version.parse("4.42.0")
+    ), "Please install a proper transformers version: pip install transformers>=4.41.2,<4.42.0"
 
 
 def _override_bfloat16_mode_default():
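As the diff shows, the assert only runs when PETALS_IGNORE_DEPENDENCY_VERSION is unset. A minimal sketch of how the gate behaves (hypothetical usage in a fresh interpreter, not from the repo):

import os

# With the variable unset, importing petals raises AssertionError
# unless 4.41.2 <= transformers.__version__ < 4.42.0.
os.environ["PETALS_IGNORE_DEPENDENCY_VERSION"] = "1"  # opt out of the version gate
import petals  # now imports even with an out-of-range transformers build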

+ 3 - 3
src/petals/client/remote_generation.py

@@ -22,20 +22,20 @@ class RemotePastKeyValues(Cache):
 
     def __init__(self) -> None:
         super().__init__()
-        self.seen_tokens = 0
+        self._seen_tokens = 0
         self.hypo_ids: Optional[torch.LongTensor] = None
 
     def __getitem__(self, _index: int) -> List[torch.Tensor]:
         return [DUMMY]  # For compatibility with BloomForCausalLM.prepare_inputs_for_generation()
 
     def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
-        return self.seen_tokens
+        return self._seen_tokens
 
     def get_max_length(self) -> Optional[int]:
         return None
 
     def update_seen(self, new_seen: int) -> None:
-        self.seen_tokens += new_seen
+        self._seen_tokens += new_seen
 
     def reorder_cache(self, beam_idx):
         raise NotImplementedError("Beam search reordering is not implemented yet")
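The rename from seen_tokens to _seen_tokens matches the private counter name used by the Cache classes in transformers 4.41, while get_seq_length() remains the public accessor. A short sketch of the bookkeeping (hypothetical token counts, not from the repo):

cache = RemotePastKeyValues()
assert cache.get_seq_length() == 0
cache.update_seen(16)  # e.g. 16 prompt tokens processed by remote servers
cache.update_seen(1)   # one newly generated token
assert cache.get_seq_length() == 17  # public accessor backed by _seen_tokens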

+ 1 - 1
src/petals/models/bloom/model.py

@@ -131,7 +131,7 @@ class DistributedBloomForCausalLM(FromPretrainedMixin, RemoteGenerationMixin, Bl
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
                 cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
+                past_length = past_key_values._seen_tokens
                 max_cache_length = past_key_values.get_max_length()
             else:
                 cache_length = past_length = past_key_values[0][0].shape[2]
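prepare_inputs_for_generation now reads the private _seen_tokens counter on Cache objects and falls back to the legacy tuple layout otherwise. A hypothetical helper restating the two branches above (for illustration only; not in the repo):

from typing import Optional, Tuple
from transformers.cache_utils import Cache

def _cache_lengths(past_key_values) -> Tuple[int, int, Optional[int]]:
    if isinstance(past_key_values, Cache):
        return past_key_values.get_seq_length(), past_key_values._seen_tokens, past_key_values.get_max_length()
    # legacy layout: tuple of (key, value) per layer, key shaped [batch, heads, seq_len, head_dim]
    length = past_key_values[0][0].shape[2]
    return length, length, None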

+ 1 - 4
src/petals/models/llama/block.py

@@ -87,10 +87,7 @@ class OptimizedLlamaAttention(LlamaAttention):
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
 
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
-        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
+        cos, sin = self.rotary_emb(value_states, position_ids)
         cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)
 
         if q_len == 1 and torch.is_inference_mode_enabled() and hidden_states.device.type == "cuda":
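For context on the removal above: in transformers 4.41, LlamaRotaryEmbedding derives the cos/sin tables from position_ids alone, and the position_ids passed during generation already account for tokens held in the KV cache, so the manual kv_seq_len accumulation over past_key_value is redundant. The new call site, annotated (the shape comments are assumptions for illustration):

cos, sin = self.rotary_emb(value_states, position_ids)  # positions are absolute, so no explicit seq_len argument is needed
cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)           # add a singleton head dim so the tables broadcast over attention heads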