
explain total_steps vs max_steps

Alexander Borzunov, 3 years ago
commit cdfca180bf
1 changed file with 2 additions and 2 deletions

examples/albert/arguments.py (+2, -2)

@@ -127,7 +127,7 @@ class AlbertTrainingArguments(TrainingArguments):
     gradient_accumulation_steps: int = 2
     seq_length: int = 512
 
-    total_steps: int = 125_000  # please note: this only affects the learning rate schedule
+    total_steps: int = 125_000  # the number of global optimizer and scheduler steps
     learning_rate: float = 0.00176
     warmup_steps: int = 5000
     adam_epsilon: float = 1e-6
@@ -148,4 +148,4 @@ class AlbertTrainingArguments(TrainingArguments):
 
     save_total_limit: int = 2
     save_steps: int = 500
-    max_steps: int = 10 ** 30  # meant as "peer should compute gradients forever"
+    max_steps: int = 10 ** 30  # the number of forward-backward passes before exit, defaults to "as many as you can"
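
For context, here is a minimal sketch of how the two settings interact, assuming a transformers-style setup rather than the repository's actual training loop (the model, optimizer, and loop below are placeholders; the hyperparameter values mirror the ones in arguments.py):

    import torch
    from transformers import get_linear_schedule_with_warmup

    model = torch.nn.Linear(8, 8)  # stand-in for the real ALBERT model
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.00176, eps=1e-6)

    # total_steps and warmup_steps define the learning-rate curve:
    # warm up over the first 5000 steps, then decay linearly to zero at 125_000.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=5000, num_training_steps=125_000
    )

    # max_steps, by contrast, is only a loop bound; 10 ** 30 effectively
    # means "keep computing gradients until the peer is stopped".
    max_steps = 10 ** 30
    for step in range(max_steps):
        optimizer.step()   # forward/backward elided in this sketch
        scheduler.step()
        if step == 2:      # toy cutoff so the sketch terminates
            break

With a linear-decay schedule like the one sketched above, the learning rate stays at zero once total_steps is reached, so a peer that keeps running under the enormous max_steps bound does no further harm; this is why the two limits are deliberately distinct.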