
Add timeouts, remove gated for tests

Max Ryabinin 3 years ago
parent
commit
1cfd86ac5b

+ 0 - 2
hivemind/moe/server/__init__.py

@@ -277,8 +277,6 @@ class Server(threading.Thread):
 
             scheduler = scheduler(optim, num_warmup_steps=num_warmup_steps, num_training_steps=num_total_steps)
 
-            expert.to(device)
-
             if use_averaging:
                 assert averaging_target_batch_size is not None
                 assert averaging_target_group_size is not None
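For context, the removed `expert.to(device)` call means the server no longer moves the expert module itself; one plausible reading is that the expert is expected to already be on the target device before it reaches this code path. A minimal, hypothetical sketch of that caller-side placement (the names below are illustrative, not the actual hivemind API):

```python
import torch
from torch import nn

# Illustrative only: with the in-server `expert.to(device)` removed,
# device placement presumably happens before the module is handed over.
device = "cuda" if torch.cuda.is_available() else "cpu"
expert = nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 1024))
expert = expert.to(device)  # place the expert on the device up front
```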

+ 10 - 1
hivemind/moe/server/expert_uid.py

@@ -1,5 +1,6 @@
 import random
 import re
+from time import sleep
 from typing import List, NamedTuple, Optional, Tuple, Union
 
 import hivemind
@@ -41,6 +42,8 @@ def generate_uids_from_pattern(
     expert_pattern: Optional[str],
     dht: Optional[DHT] = None,
     attempts_per_expert=10,
+    timeout=10,
+    random_sleep=True,
 ) -> List[str]:
     """
     Sample experts from a given pattern, optionally remove duplicates.
@@ -88,7 +91,7 @@ def generate_uids_from_pattern(
                 attempted_uids.add(new_uid)
                 new_uids.append(new_uid)
 
-        if dht:
+        if dht is not None:
             existing_expert_uids = {
                 found_expert.uid
                 for found_expert in hivemind.moe.server.get_experts(dht, new_uids)
@@ -98,6 +101,12 @@ def generate_uids_from_pattern(
 
         found_uids += new_uids
 
+        if len(found_uids) < num_experts:
+            if random_sleep:
+                sleep(timeout + random.random())
+            else:
+                sleep(timeout)
+
     if len(found_uids) != num_experts:
         logger.warning(
             f"Found only {len(found_uids)} out of {num_experts} free expert uids after "

+ 1 - 7
hivemind/moe/server/layers/albert.py

@@ -23,17 +23,11 @@ import torch.nn.functional as F
 from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.utils.checkpoint import checkpoint, get_device_states, set_device_states
 from transformers import AlbertConfig
-from transformers.file_utils import add_start_docstrings
 from transformers.modeling_outputs import BaseModelOutput
-from transformers.modeling_utils import PreTrainedModel
 from transformers.models.albert.modeling_albert import (
     ACT2FN,
-    ALBERT_START_DOCSTRING,
-    AlbertForPreTraining,
     AlbertLayerGroup,
     AlbertMLMHead,
-    AlbertModel,
-    AlbertSOPHead,
     AlbertTransformer,
 )
 from transformers.utils import logging
@@ -46,7 +40,7 @@ _TOKENIZER_FOR_DOC = "AlbertTokenizer"
 
 class LeanAlbertConfig(AlbertConfig):
     rotary_embedding_base: int = 10_000
-    hidden_act_gated: bool = True
+    hidden_act_gated: bool = False
 
     def __hash__(self):
         return hash("\t".join(f"{k}={v}" for k, v in self.__dict__.items() if not k.startswith("_")))
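For reference, a `hidden_act_gated`-style flag typically switches the feed-forward block between a plain MLP and a gated (GLU-style) variant, where a second projection acts as a multiplicative gate on the first. A minimal sketch of that difference, using illustrative module names that do not come from this repository:

```python
import torch
from torch import nn
import torch.nn.functional as F


class FeedForward(nn.Module):
    """Illustrative feed-forward block showing what a `gated` flag usually toggles."""

    def __init__(self, hidden_size: int, intermediate_size: int, gated: bool = False):
        super().__init__()
        self.gated = gated
        self.up = nn.Linear(hidden_size, intermediate_size)
        # The gated (GLU-style) variant needs an extra projection for the gate branch
        self.gate = nn.Linear(hidden_size, intermediate_size) if gated else None
        self.down = nn.Linear(intermediate_size, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.gated:
            # GEGLU-style: activated gate multiplies the linear branch elementwise
            return self.down(F.gelu(self.gate(x)) * self.up(x))
        return self.down(F.gelu(self.up(x)))
```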