
Attempt training without offload

Max Ryabinin, 3 years ago
Parent commit 61f20884b9
2 changed files with 14 additions and 40 deletions
  1. +14 −2 hivemind/moe/server/__init__.py
  2. +0 −38 hivemind/moe/server/layers/albert.py

+ 14 - 2
hivemind/moe/server/__init__.py

@@ -252,9 +252,8 @@ class Server(threading.Thread):
                 },
             ]
 
-            optim = OffloadOptimizer(
+            optim = LambWithGradientClipping(
                 optimizer_grouped_parameters,
-                optim_cls=LambWithGradientClipping,
                 lr=0.0035355339059327377,
                 betas=(0.9, 0.999),
                 eps=1e-6,
@@ -264,6 +263,18 @@ class Server(threading.Thread):
                 debias=True,
             )
 
+            # optim = OffloadOptimizer(
+            #     optimizer_grouped_parameters,
+            #     optim_cls=LambWithGradientClipping,
+            #     lr=0.0035355339059327377,
+            #     betas=(0.9, 0.999),
+            #     eps=1e-6,
+            #     weight_decay=0.01,
+            #     max_grad_norm=1,
+            #     clamp_value=10000.0,
+            #     debias=True,
+            # )
+
             expert.to(device)
 
             if use_averaging:
@@ -274,6 +285,7 @@ class Server(threading.Thread):
                     optim,
                     dht=dht,
                     prefix=expert_uid.split(UID_DELIMITER)[0],
+                    scheduler=scheduler,
                     compression=BASE_COMPRESSION_TYPES[averaging_compression],
                     state_compression=BASE_COMPRESSION_TYPES[averaging_compression],
                     target_batch_size=averaging_target_batch_size,

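For context on what "without offload" removes: the change above constructs LambWithGradientClipping directly instead of wrapping it in OffloadOptimizer. The sketch below is only a generic illustration of the usual offload pattern (optimizer state and master parameters kept on CPU, gradients streamed there each step, updated weights copied back); it is an assumption-level outline, not hivemind's actual OffloadOptimizer, and the SimpleOffloadOptimizer name and signature are hypothetical.

import torch

class SimpleOffloadOptimizer:
    # Illustration only (not hivemind's OffloadOptimizer): keep master parameters
    # and the wrapped optimizer's state on CPU, stream gradients there each step,
    # then copy the updated weights back to the training device.
    def __init__(self, params, optim_cls, **optim_kwargs):
        self.device_params = list(params)  # parameters on the training device
        self.cpu_params = [p.detach().cpu().clone().requires_grad_(True)
                           for p in self.device_params]
        self.inner = optim_cls(self.cpu_params, **optim_kwargs)  # state lives on CPU

    @torch.no_grad()
    def step(self):
        # move gradients to CPU, update there, copy parameters back
        for dev_p, cpu_p in zip(self.device_params, self.cpu_params):
            cpu_p.grad = None if dev_p.grad is None else dev_p.grad.detach().cpu()
        self.inner.step()
        for dev_p, cpu_p in zip(self.device_params, self.cpu_params):
            dev_p.copy_(cpu_p.to(dev_p.device))

    def zero_grad(self):
        for dev_p in self.device_params:
            dev_p.grad = None

# hypothetical usage:
# optim = SimpleOffloadOptimizer(expert.parameters(), torch.optim.AdamW, lr=1e-3)

Dropping the wrapper, as this commit attempts, keeps the optimizer state on the same device as the expert, trading higher accelerator memory use for the removal of per-step host-device copies.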
+ 0 - 38
hivemind/moe/server/layers/albert.py

@@ -543,44 +543,6 @@ class LeanAlbertTransformer(AlbertTransformer):
         )
 
 
-@add_start_docstrings(
-    "The bare LeanALBERT Model transformer outputting raw hidden-states without any specific head on top.",
-    ALBERT_START_DOCSTRING,
-)
-class LeanAlbertModel(AlbertModel):
-    config_class = LeanAlbertConfig
-
-    def __init__(self, config: AlbertConfig, add_pooling_layer=True):
-        PreTrainedModel.__init__(self, config)
-
-        self.config = config
-        self.embeddings = LeanAlbertEmbeddings(config)
-        self.encoder = LeanAlbertTransformer(config)
-
-        if add_pooling_layer:
-            self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
-            self.pooler_activation = nn.Tanh()
-        else:
-            self.pooler = None
-            self.pooler_activation = None
-
-        self.init_weights()
-
-
-class LeanAlbertForPreTraining(AlbertForPreTraining, PreTrainedModel):
-    config_class = LeanAlbertConfig
-    base_model_prefix = "albert"
-
-    def __init__(self, config: AlbertConfig):
-        PreTrainedModel.__init__(self, config)
-
-        self.albert = LeanAlbertModel(config)
-        self.predictions = AlbertMLMHead(config)
-        self.sop_classifier = AlbertSOPHead(config)
-
-        self.init_weights()
-
-
 from hivemind.moe.server.layers.custom_experts import register_expert_class
 
 SEQUENCE_LENGTH = 2048