@@ -1,7 +1,8 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import os
 import pickle
+import sys
 from dataclasses import asdict
 from pathlib import Path
 
 import torch
@@ -58,36 +59,6 @@ def get_model(training_args, config, tokenizer):
     return model
 
 
-def get_optimizer_and_scheduler(training_args, model):
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": training_args.weight_decay,
-        },
-        {
-            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
-            "weight_decay": 0.0,
-        },
-    ]
-
-    opt = Lamb(
-        optimizer_grouped_parameters,
-        lr=training_args.learning_rate,
-        betas=(training_args.adam_beta1, training_args.adam_beta2),
-        eps=training_args.adam_epsilon,
-        weight_decay=training_args.weight_decay,
-        clamp_value=training_args.clamp_value,
-        debias=True,
-    )
-
-    scheduler = get_linear_schedule_with_warmup(
-        opt, num_warmup_steps=training_args.warmup_steps, num_training_steps=training_args.max_steps
-    )
-
-    return opt, scheduler
-
-
 class CollaborativeCallback(transformers.TrainerCallback):
     """
     This callback monitors and reports collaborative training progress.
@@ -149,9 +120,9 @@ class CollaborativeCallback(transformers.TrainerCallback):
                 )
                 logger.info(f"Step #{self.optimizer.local_epoch}")
                 logger.info(f"Your current contribution: {self.total_samples_processed} samples")
-                logger.info(f"Performance: {samples_per_second} samples per second.")
+                logger.info(f"Performance: {samples_per_second:.3f} samples/sec")
                 if self.steps:
-                    logger.info(f"Local loss: {self.loss / self.steps}")
+                    logger.info(f"Local loss: {self.loss / self.steps:.5f}")
                 if self.optimizer.local_epoch % self.backup_every_steps == 0:
                     self.latest_backup = self.backup_state()
@@ -219,10 +190,7 @@ def main():
         )
     )
     training_args, dataset_args, collaboration_args, averager_args, tracker_args = parser.parse_args_into_dataclasses()
-
     logger.info(f"Found {len(collaboration_args.initial_peers)} initial peers: {collaboration_args.initial_peers}")
-    if len(collaboration_args.initial_peers) == 0:
-        raise ValueError("Please specify at least one network endpoint in initial peers.")
 
     setup_transformers_logging(training_args.local_rank)
     logger.info(f"Training/evaluation parameters:\n{training_args}")
@@ -231,7 +199,15 @@ def main():
     set_seed(training_args.seed)
 
     config = AlbertConfig.from_pretrained(dataset_args.config_path, cache_dir=dataset_args.cache_dir)
-    tokenizer = AlbertTokenizerFast.from_pretrained(dataset_args.tokenizer_path, cache_dir=dataset_args.cache_dir)
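+    # Fail fast with an actionable message when the tokenizer files are missing,
+    # rather than letting from_pretrained raise a bare OSError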
+    try:
+        tokenizer = AlbertTokenizerFast.from_pretrained(dataset_args.tokenizer_path, cache_dir=dataset_args.cache_dir)
+    except OSError:
+        logger.fatal(
+            f"No tokenizer data found in {dataset_args.tokenizer_path}, "
+            f"please run ./tokenize_wikitext103.py before running this script"
+        )
+        sys.exit(1)
+
     model = get_model(training_args, config, tokenizer)
     model.to(training_args.device)
 
@@ -239,8 +215,6 @@ def main():
     # This data collator will take care of randomly masking the tokens.
     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)
 
-    opt, scheduler = get_optimizer_and_scheduler(training_args, model)
-
    validators, local_public_key = utils.make_validators(collaboration_args.experiment_prefix)
 
    dht = DHT(
@@ -261,12 +235,41 @@ def main():
 
     adjusted_target_batch_size = collaboration_args.target_batch_size - collaboration_args.batch_size_lead
 
+    # We pass a lambda (an optimizer factory) instead of an optimizer instance
+    # so that hivemind.Optimizer(..., offload_optimizer=True) can construct the optimizer itself
+    opt = lambda params: Lamb(
+        params,
+        lr=training_args.learning_rate,
+        betas=(training_args.adam_beta1, training_args.adam_beta2),
+        eps=training_args.adam_epsilon,
+        weight_decay=training_args.weight_decay,
+        clamp_value=training_args.clamp_value,
+        debias=True,
+    )
+
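+    # Two parameter groups: biases and LayerNorm weights are exempt from weight
+    # decay, all remaining parameters use training_args.weight_decay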
+    no_decay = ["bias", "LayerNorm.weight"]
+    params = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": training_args.weight_decay,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+
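+    # The scheduler is a factory for the same reason: hivemind.Optimizer calls it
+    # with the optimizer instance it builds from the factory above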
+    scheduler = lambda opt: get_linear_schedule_with_warmup(
+        opt, num_warmup_steps=training_args.warmup_steps, num_training_steps=training_args.max_steps
+    )
+
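+    # hivemind.Optimizer runs a global optimizer step once the swarm collectively
+    # accumulates adjusted_target_batch_size samples across all peers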
     optimizer = Optimizer(
         dht=dht,
         run_id=collaboration_args.experiment_prefix,
         target_batch_size=adjusted_target_batch_size,
         batch_size_per_step=total_batch_size_per_step,
         optimizer=opt,
+        params=params,
         scheduler=scheduler,
         matchmaking_time=collaboration_args.matchmaking_time,
         averaging_timeout=collaboration_args.averaging_timeout,