3 жил өмнө · e5f04cac23
--- a/run_trainer_tpu.py
+++ b/run_trainer_tpu.py
@@ -40,7 +40,7 @@ def main():
 
				     tpu_manager = TPUManager(model, dataset=task.training_dataset, collate_fn=task.data_collator,
			
 
				                              grad_accumulation_steps=trainer_args.gradient_accumulation_steps,
			
 
				                              batch_size_per_device=trainer_args.per_device_train_batch_size,
			
 
				-                             nprocs=trainer_args.n_tpus, start=True)
			
 
				+                             nprocs=trainer_args.num_tpus, start=True)
			
 
				 
			
 
				     model = task.model = tpu_manager._synchronizer.master_model
			
 
				 
			
--- a/task.py
+++ b/task.py
@@ -38,6 +38,9 @@ class ModelWrapper(nn.Module):
 
				         super().__init__()
			
 
				         self.model = model
			
 
				 
			
 
				+    def tie_weights(self):
			
 
				+        pass
			
 
				+
			
 
				     def forward(self, input_ids, attention_mask, image):
			
 
				         loss = self.model.forward(text=input_ids, image=image, mask=attention_mask, return_loss=True)
			
 
				         return {'loss': loss}
			
@@ -64,7 +67,7 @@ class TrainingTask:
 
				         if latest_checkpoint_dir is None:
			
 
				             logger.info(f"Creating model")
			
 
				 
			
 
				-            depth = 64
			
 
				+            depth = 16#TODO
			
 
				             attn_types = list(islice(cycle(['axial_row', 'axial_col', 'axial_row', 'axial_row']), depth - 1))
			
 
				             attn_types.append('conv_like')
			
 
				             shared_layer_ids = list(islice(cycle(range(4)), depth - 1))