@@ -127,7 +127,7 @@ class AlbertTrainingArguments(TrainingArguments):
     gradient_accumulation_steps: int = 2
     seq_length: int = 512

-    total_steps: int = 125_000  # please note: this only affects the learning rate schedule
+    total_steps: int = 125_000  # the number of global optimizer and scheduler steps
     learning_rate: float = 0.00176
     warmup_steps: int = 5000
     adam_epsilon: float = 1e-6
@@ -148,4 +148,4 @@ class AlbertTrainingArguments(TrainingArguments):
     save_total_limit: int = 2
     save_steps: int = 500

-    max_steps: int = 10 ** 30  # meant as "peer should compute gradients forever"
+    max_steps: int = 10 ** 30  # the number of forward-backward passes before exit, defaults to "as many as you can"
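For context on the reworded comments, here is a minimal sketch of how the two step counts are usually consumed: `total_steps` stretches the learning-rate schedule over the global optimizer/scheduler steps, while `max_steps` only bounds the local loop of forward-backward passes. The model, optimizer, and loop below are illustrative stand-ins, not the example's actual training code.

```python
import torch
from transformers import get_linear_schedule_with_warmup

# Values copied from the fields above.
learning_rate, adam_epsilon, warmup_steps, total_steps = 0.00176, 1e-6, 5000, 125_000
max_steps = 10 ** 30

model = torch.nn.Linear(8, 8)  # stand-in for the ALBERT model
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=adam_epsilon)

# total_steps drives the learning-rate schedule: the warmup/decay curve is laid
# out over this many *global* optimizer (and scheduler) steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

# max_steps only caps how many forward-backward passes this peer runs before
# exiting; 10 ** 30 effectively means "keep computing gradients until stopped".
for step in range(max_steps):
    break  # training body omitted in this sketch
```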