ソースを参照

Log correct loss in examples/albert/run_first_peer.py (#265)

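#240 changed the formula for the current loss to `sum_loss / sum_mini_steps`, but the `logger.info` call was not updated and still computed `sum_loss / alive_peers`. As a result, the coordinator logged incorrect loss values. This commit computes `current_loss` once and reuses it for the wandb log, the checkpoint upload, and the log message.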
Aleksandr Borzunov 4 年 前
コミット
9bb775fe04
1 ファイル変更、5 行追加、3 行削除
  1. 5 3
      examples/albert/run_first_peer.py

+ 5 - 3
examples/albert/run_first_peer.py

@@ -176,9 +176,11 @@ if __name__ == '__main__':
                     sum_perf += item.samples_per_second
                     num_samples += item.samples_accumulated
                     sum_mini_steps += item.mini_steps
+                current_loss = sum_loss / sum_mini_steps
+                
                 if coordinator_args.wandb_project is not None:
                     wandb.log({
-                        "loss": sum_loss / sum_mini_steps,
+                        "loss": current_loss,
                         "alive peers": alive_peers,
                         "samples": num_samples,
                         "performance": sum_perf
@@ -186,7 +188,7 @@ if __name__ == '__main__':
                 if checkpoint_handler.is_time_to_save_state(current_step):
                     checkpoint_handler.save_state(current_step)
                     if checkpoint_handler.is_time_to_upload():
-                        checkpoint_handler.upload_checkpoint(sum_loss / sum_mini_steps)
-                logger.info(f"Step #{current_step}\tloss = {sum_loss / alive_peers:.5f}")
+                        checkpoint_handler.upload_checkpoint(current_loss)
+                logger.info(f"Step #{current_step}\tloss = {current_loss:.5f}")
         logger.debug("Peer is still alive...")
         time.sleep(coordinator_args.refresh_period)