justheuristic 3 жил өмнө
parent
commit
e5f04cac23
2 өөрчлөгдсөн, 5 нэмэгдсэн, 2 устгасан
  1. 1 1
      run_trainer_tpu.py
  2. 4 1
      task.py

+ 1 - 1
run_trainer_tpu.py

@@ -40,7 +40,7 @@ def main():
     tpu_manager = TPUManager(model, dataset=task.training_dataset, collate_fn=task.data_collator,
                              grad_accumulation_steps=trainer_args.gradient_accumulation_steps,
                              batch_size_per_device=trainer_args.per_device_train_batch_size,
-                             nprocs=trainer_args.n_tpus, start=True)
+                             nprocs=trainer_args.num_tpus, start=True)
 
     model = task.model = tpu_manager._synchronizer.master_model
 

+ 4 - 1
task.py

@@ -38,6 +38,9 @@ class ModelWrapper(nn.Module):
         super().__init__()
         self.model = model
 
+    def tie_weights(self):
+        pass
+
     def forward(self, input_ids, attention_mask, image):
         loss = self.model.forward(text=input_ids, image=image, mask=attention_mask, return_loss=True)
         return {'loss': loss}
@@ -64,7 +67,7 @@ class TrainingTask:
         if latest_checkpoint_dir is None:
             logger.info(f"Creating model")
 
-            depth = 64
+            depth = 16#TODO
             attn_types = list(islice(cycle(['axial_row', 'axial_col', 'axial_row', 'axial_row']), depth - 1))
             attn_types.append('conv_like')
             shared_layer_ids = list(islice(cycle(range(4)), depth - 1))