@@ -23,6 +23,7 @@ class TrainingAverager(DecentralizedAverager):
     :param opt: a pytorch optimizer to be averaged between peers (complete with model parameters)
     :param average_parameters: whether or not to average model parameters in self.step(...)
     :param average_gradients: whether or not to average model gradients in self.step(...)
+    :param average_opt_statistics: if specified, average optimizer statistics with corresponding names in statedict
     :param initialize_optimizer: if True, this will run a speculative optimizer step with
       zero gradients to initialize all tensors. If False, please initialize the optimizer state manually.
     :param extra_tensors: if specified, these extra tensors will also be averaged and shared in load_state_from_peers.
@@ -30,9 +31,11 @@ class TrainingAverager(DecentralizedAverager):
     :param kwargs: any additional parameters will be forwarded to DecentralizedAverager
     """
 
     def __init__(self, opt: torch.optim.Optimizer, *, average_parameters: bool, average_gradients: bool,
-                 extra_tensors: Sequence[torch.Tensor] = (), initialize_optimizer: bool = True, **kwargs):
+                 average_opt_statistics: Sequence[str] = (), extra_tensors: Sequence[torch.Tensor] = (),
+                 initialize_optimizer: bool = True, **kwargs):
         self.opt, self.extra_tensors, self.local_step = opt, tuple(extra_tensors), 0
+        self.opt_statistics = tuple(average_opt_statistics)
         self.average_parameters, self.average_gradients = average_parameters, average_gradients
         self.lock_averager_step = Lock()
         if initialize_optimizer:
@@ -46,7 +49,7 @@ class TrainingAverager(DecentralizedAverager):
     def step(self, wait: bool = True, **kwargs):
         """ Average optimizer weights and gradients with peers. """
         if not wait:
-            return run_in_background(self.step, wait=False, **kwargs)
+            return run_in_background(self.step, wait=True, **kwargs)
 
         local_tensors = list(self.local_tensors())
         with self.lock_averager_step:
@@ -85,6 +88,10 @@ class TrainingAverager(DecentralizedAverager):
                         yield param.grad
                     elif replace_none:
                         yield torch.zeros_like(param)
+        for stats in self.opt_statistics:
+            for param_group in self.opt.param_groups:
+                for param in param_group['params']:
+                    yield self.opt.state[param][stats]
         yield from iter(self.extra_tensors)
 
     def get_current_state(self):
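
Usage sketch (not part of the patch) for the new average_opt_statistics argument, assuming the
TrainingAverager class from this file, a running hivemind.DHT, and that the remaining keyword
arguments (prefix, target_group_size, start, ...) are forwarded to DecentralizedAverager as the
docstring states; exact keyword names may vary between hivemind versions:

    import torch
    import hivemind

    model = torch.nn.Linear(16, 4)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)   # Adam keeps "exp_avg" / "exp_avg_sq" per parameter
    dht = hivemind.DHT(start=True)

    averager = TrainingAverager(
        opt, average_parameters=True, average_gradients=True,
        average_opt_statistics=("exp_avg_sq",),            # averaged alongside weights and grads in step()
        dht=dht, prefix="demo_run", target_group_size=2, start=True)

    # initialize_optimizer=True (the default) runs a speculative zero-grad step, so opt.state
    # already holds an "exp_avg_sq" entry for every parameter when averaging begins.
    averager.step()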