@@ -153,7 +153,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
         self.performance_ema = PerformanceEMA(alpha=performance_ema_alpha)
         self.last_step_time = None
 
-        self.collaboration_state = self.fetch_collaboration_state()
+        self.collaboration_state = self._fetch_state()
         self.lock_collaboration_state, self.collaboration_state_updated = Lock(), Event()
         self.lock_local_progress, self.should_report_progress = Lock(), Event()
         self.progress_reporter = Thread(target=self.report_training_progress, daemon=True, name=f"{self}.reporter")
@@ -237,8 +237,8 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
         if not self.collaboration_state.ready_for_step:
             return
 
-        logger.log(self.status_loglevel, f"Beginning global optimizer step {self.collaboration_state.optimizer_step}")
-        self.collaboration_state = self.fetch_collaboration_state()
+        logger.log(self.status_loglevel, f"Beginning global optimizer step #{self.collaboration_state.optimizer_step}")
+        self.collaboration_state = self._fetch_state()
         self.collaboration_state_updated.set()
 
         if not self.is_synchronized:
@@ -288,8 +288,8 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
         if not self.collaboration_state.ready_for_step:
             return
 
-        logger.log(self.status_loglevel, f"Beginning global optimizer step {self.collaboration_state.optimizer_step}")
-        self.collaboration_state = self.fetch_collaboration_state()
+        logger.log(self.status_loglevel, f"Beginning global optimizer step #{self.collaboration_state.optimizer_step}")
+        self.collaboration_state = self._fetch_state()
         self.collaboration_state_updated.set()
 
         with self.lock_collaboration_state:
@@ -392,9 +392,9 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
                 continue  # if state was updated externally, reset timer
 
             with self.lock_collaboration_state:
-                self.collaboration_state = self.fetch_collaboration_state()
+                self.collaboration_state = self._fetch_state()
 
-    def fetch_collaboration_state(self) -> CollaborationState:
+    def _fetch_state(self) -> CollaborationState:
         """Read performance statistics reported by peers, estimate progress towards next batch"""
         response, _expiration = self.dht.get(self.training_progress_key, latest=True) or (None, -float("inf"))
         current_time = get_dht_time()
@@ -452,9 +452,9 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
         )
         logger.log(
             self.status_loglevel,
-            f"Collaboration accumulated {total_samples_accumulated} samples from "
-            f"{num_peers} peers; ETA {estimated_time_to_next_step:.2f} seconds "
-            f"(refresh in {time_to_next_fetch:.2f}s.)",
+            f"{self.prefix} accumulated {total_samples_accumulated} samples from "
+            f"{num_peers} peers for step #{global_optimizer_step}. "
+            f"ETA {estimated_time_to_next_step:.2f} sec (refresh in {time_to_next_fetch:.2f} sec)",
         )
         return CollaborationState(
             global_optimizer_step,