|
@@ -176,9 +176,11 @@ if __name__ == '__main__':
|
|
|
sum_perf += item.samples_per_second
|
|
|
num_samples += item.samples_accumulated
|
|
|
sum_mini_steps += item.mini_steps
|
|
|
+ current_loss = sum_loss / sum_mini_steps
|
|
|
+
|
|
|
if coordinator_args.wandb_project is not None:
|
|
|
wandb.log({
|
|
|
- "loss": sum_loss / sum_mini_steps,
|
|
|
+ "loss": current_loss,
|
|
|
"alive peers": alive_peers,
|
|
|
"samples": num_samples,
|
|
|
"performance": sum_perf
|
|
@@ -186,7 +188,7 @@ if __name__ == '__main__':
|
|
|
if checkpoint_handler.is_time_to_save_state(current_step):
|
|
|
checkpoint_handler.save_state(current_step)
|
|
|
if checkpoint_handler.is_time_to_upload():
|
|
|
- checkpoint_handler.upload_checkpoint(sum_loss / sum_mini_steps)
|
|
|
- logger.info(f"Step #{current_step}\tloss = {sum_loss / alive_peers:.5f}")
|
|
|
+ checkpoint_handler.upload_checkpoint(current_loss)
|
|
|
+ logger.info(f"Step #{current_step}\tloss = {current_loss:.5f}")
|
|
|
logger.debug("Peer is still alive...")
|
|
|
time.sleep(coordinator_args.refresh_period)
|