@@ -127,7 +127,7 @@ class AlbertTrainingArguments(TrainingArguments):
     gradient_accumulation_steps: int = 2
     seq_length: int = 512

-    total_steps: int = 125_000  # please note: this only affects the learning rate schedule
+    total_steps: int = 125_000  # the number of global optimizer and scheduler steps
     learning_rate: float = 0.00176
     warmup_steps: int = 5000
     adam_epsilon: float = 1e-6
@@ -148,4 +148,4 @@ class AlbertTrainingArguments(TrainingArguments):
     save_total_limit: int = 2
     save_steps: int = 500

-    max_steps: int = 10 ** 30  # meant as "peer should compute gradients forever"
+    max_steps: int = 10 ** 30  # the number of forward-backward passes before exit, defaults to "as many as you can"
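For context on the reworded comments, here is a minimal sketch of how the two step counts are usually consumed: `total_steps` stretches the learning-rate schedule over the global optimizer/scheduler steps, while `max_steps` only bounds the local loop of forward-backward passes. The model, optimizer, and loop below are illustrative stand-ins, not the example's actual training code.

```python
import torch
from transformers import get_linear_schedule_with_warmup

# Values copied from the fields above.
learning_rate, adam_epsilon, warmup_steps, total_steps = 0.00176, 1e-6, 5000, 125_000
max_steps = 10 ** 30

model = torch.nn.Linear(8, 8)  # stand-in for the ALBERT model
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=adam_epsilon)

# total_steps drives the learning-rate schedule: the warmup/decay curve is laid
# out over this many *global* optimizer (and scheduler) steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

# max_steps only caps how many forward-backward passes this peer runs before
# exiting; 10 ** 30 effectively means "keep computing gradients until stopped".
for step in range(max_steps):
    break  # training body omitted in this sketch
```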