
rename to params

Max Ryabinin 3 years ago
commit 9554720833

+ 1 - 1
benchmarks/benchmark_optimizer.py

@@ -74,7 +74,7 @@ def _run_training_with_swarm(args: TrainingArguments):
         optimizer = Optimizer(
             prefix=args.prefix,
             target_batch_size=args.target_batch_size,
-            param_groups=model.parameters(),
+            params=model.parameters(),
             optimizer=partial(torch.optim.SGD, lr=args.lr_base),
             scheduler=partial(torch.optim.lr_scheduler.StepLR, gamma=args.lr_gamma, step_size=args.lr_step_size),
             dht=hivemind.DHT(initial_peers=dht.get_visible_maddrs(), client_mode=client_mode, start=True),

+ 3 - 3
hivemind/optim/experimental/optimizer.py

@@ -58,7 +58,7 @@ class Optimizer(torch.optim.Optimizer):
     :param target_batch_size: perform optimizer step after all peers collectively accumulate this many samples
     :param batch_size_per_step: before each call to .step, user should accumulate gradients over this many samples
     :param optimizer: a standard pytorch optimizer, preferably a large-batch one such as LAMB, LARS, etc.
-    :param param_groups: optional, a list/tuple of parameters or structured param groups for the optimizer
+    :param params: optional, a list/tuple of parameters or structured param groups for the optimizer
     :param scheduler: if specified, use this scheduler to update optimizer learning rate
     :note: If you are using CollaborativeOptimizer with lr_scheduler, it is recommended to pass this scheduler
       explicitly into this class. Otherwise, the scheduler may not be synchronized between peers.
@@ -92,7 +92,7 @@ class Optimizer(torch.optim.Optimizer):
         target_batch_size: int,
         batch_size_per_step: Optional[int] = None,
         optimizer: Union[TorchOptimizer, OptimizerFactory],
-        param_groups: Optional[Union[Parameters, ParamGroups]] = None,
+        params: Optional[Union[Parameters, ParamGroups]] = None,
         scheduler: Optional[Union[LRSchedulerBase, SchedulerFactory]] = None,
         matchmaking_time: Optional[float] = 15.0,
         averaging_timeout: Optional[float] = 300.0,
@@ -115,7 +115,7 @@ class Optimizer(torch.optim.Optimizer):
         self.scheduled_round: Optional[StepControl] = None

         self.state_averager = self._make_state_averager(
-            optimizer=optimizer, param_groups=param_groups, scheduler=scheduler, **averager_opts or {}
+            optimizer=optimizer, params=params, scheduler=scheduler, **averager_opts or {}
         )
         self.grad_averager = self._make_gradient_averager(reuse_grad_buffers=reuse_grad_buffers, **averager_opts or {})
         self.tracker = self._make_progress_tracker(target_batch_size, **tracker_opts or {})

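For context, a minimal usage sketch of the Optimizer constructor after this rename, mirroring the benchmark above. The model, prefix, batch sizes, and learning-rate values are placeholders, and arguments not visible in this diff may also be required:

import torch
from functools import partial

import hivemind
from hivemind.optim.experimental.optimizer import Optimizer

model = torch.nn.Linear(16, 4)  # placeholder model
dht = hivemind.DHT(start=True)  # standalone DHT peer for illustration

opt = Optimizer(
    prefix="demo_run",                # placeholder experiment prefix
    target_batch_size=4096,           # placeholder global batch size
    batch_size_per_step=32,           # placeholder local batch size per .step call
    params=model.parameters(),        # renamed from param_groups in this commit
    optimizer=partial(torch.optim.SGD, lr=0.1),
    scheduler=partial(torch.optim.lr_scheduler.StepLR, gamma=0.5, step_size=10),
    dht=dht,
)

Per the docstring above, the caller accumulates gradients over batch_size_per_step samples before each opt.step(), and the collective optimizer step happens once target_batch_size samples are reached across peers.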
+ 6 - 6
hivemind/optim/experimental/state_averager.py

@@ -36,7 +36,7 @@ class TrainingStateAverager(DecentralizedAverager):

     Example:

-    >>> avgr = TrainingStateAverager(optimizer=torch.optim.Adam, param_groups=model.parameters(), ...)
+    >>> avgr = TrainingStateAverager(optimizer=torch.optim.Adam, params=model.parameters(), ...)
     >>> # alternative interface: TrainingStateAverager(optimizer=torch.optim.Adam(model.parameters()), ...)
     >>> avgr.load_state_from_peers()
     >>> for i, batch in enumerate(training_dataloader):
@@ -49,7 +49,7 @@ class TrainingStateAverager(DecentralizedAverager):
       TrainingStateAverager.step(..., optimizer_step=True)

     :param optimizer: PyTorch Optimizer or a callable that creates an optimizer from param groups
-    :param param_groups: optional, a list/tuple of parameters or structured param groups for the optimizer
+    :param params: optional, a list/tuple of parameters or structured param groups for the optimizer
     :param scheduler: optional learning rate scheduler or callable that creates one from optimizer instance
     :note: if provided, scheduler will be updated based on averager.local_epoch, not the number of step cycles
     :param initialize_optimizer: if True, run a speculative optimizer step with zero gradients to initialize all
@@ -61,7 +61,7 @@ class TrainingStateAverager(DecentralizedAverager):
     :param reuse_tensors: if True, reuse parameters and optimizer statistics as averaged_tensors for allreduce.
       For this to work, all parameters must be on CPU and have the appropriate dtype for use in DecentralizedAverager
     :param sync_epoch_when_averaging: if True, update local epoch to the latest epoch among averaging peers
-    :param parameter_names: optionally provide parameter names in the same order as param_groups
+    :param parameter_names: optionally provide parameter names in the same order as in params
     :param average_opt_statistics: names of optimizer statistics from state dict that should be averaged with peers
     :param extra_tensors: if specified, these extra tensors will also be averaged and shared in load_state_from_peers.
     :note: you can use extra_tensors for any tensors not used by the optimizer (e.g. batchnorm statistics)
@@ -73,7 +73,7 @@ class TrainingStateAverager(DecentralizedAverager):
         *,
         dht: hivemind.DHT,
         optimizer: Union[TorchOptimizer, OptimizerFactory],
-        param_groups: Optional[Union[Parameters, ParamGroups]] = None,
+        params: Optional[Union[Parameters, ParamGroups]] = None,
         scheduler: Optional[Union[LRSchedulerBase, SchedulerFactory]] = None,
         initialize_optimizer: Optional[bool] = None,
         offload_optimizer: bool = False,
@@ -93,7 +93,7 @@ class TrainingStateAverager(DecentralizedAverager):
         if custom_gradients and not offload_optimizer:
             logger.warning("Setting custom_gradients=True has no effect because the optimizer is not offloaded")

-        param_groups, main_parameters, parameter_names = self._check_params(optimizer, param_groups, parameter_names)
+        params_groups, main_parameters, parameter_names = self._check_params(optimizer, params, parameter_names)

         self.status_loglevel = status_loglevel
         self.reuse_tensors = reuse_tensors
@@ -103,7 +103,7 @@ class TrainingStateAverager(DecentralizedAverager):
         self.main_parameters, self.parameter_names = main_parameters, parameter_names
         self._averaged_parameters = tuple(map(self._make_host_tensor, main_parameters))
         self.optimizer, self.scheduler = self._init_components(
-            param_groups, optimizer, scheduler, initialize_optimizer
+            params_groups, optimizer, scheduler, initialize_optimizer
         )
         self.opt_keys_for_averaging, self.extra_tensors = average_opt_statistics, extra_tensors
         self.sync_epoch_when_averaging = sync_epoch_when_averaging

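Similarly, a minimal sketch of a TrainingStateAverager call site after the rename, combining the docstring example with the test usage below. The model and DHT setup are placeholders, and any constructor argument not visible in this diff (e.g. prefix, inherited from DecentralizedAverager) is an assumption:

import torch
import hivemind
from hivemind.optim.experimental.state_averager import TrainingStateAverager

model = torch.nn.Linear(16, 4)  # placeholder model
dht = hivemind.DHT(start=True)  # standalone DHT peer for illustration

averager = TrainingStateAverager(
    dht=dht,
    optimizer=torch.optim.Adam,       # optimizer factory, instantiated from `params`
    params=model.parameters(),        # renamed from param_groups in this commit
    prefix="demo_state_averager",     # assumed averaging key prefix
    start=True,
)
averager.load_state_from_peers()      # as in the docstring example above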
+ 4 - 4
tests/test_optimizer.py

@@ -106,10 +106,10 @@ def test_state_averager(offload_optimizer: bool, reuse_tensors: bool, sync_epoch
     )

     avgr1 = TrainingStateAverager(
-        dht=dht1, param_groups=model1.parameters(), extra_tensors=extras1, start=True, **common_kwargs
+        dht=dht1, params=model1.parameters(), extra_tensors=extras1, start=True, **common_kwargs
     )
     avgr2 = TrainingStateAverager(
-        dht=dht2, param_groups=model2.parameters(), extra_tensors=extras2, start=True, **common_kwargs
+        dht=dht2, params=model2.parameters(), extra_tensors=extras2, start=True, **common_kwargs
     )

     x = torch.ones(2)
@@ -161,10 +161,10 @@ def test_load_state_from_peers():
     )

     avgr1 = TrainingStateAverager(
-        dht=dht1, param_groups=model1.parameters(), allow_state_sharing=False, start=True, **common_kwargs
+        dht=dht1, params=model1.parameters(), allow_state_sharing=False, start=True, **common_kwargs
     )

-    avgr2 = TrainingStateAverager(dht=dht2, param_groups=model2.parameters(), start=True, **common_kwargs)
+    avgr2 = TrainingStateAverager(dht=dht2, params=model2.parameters(), start=True, **common_kwargs)

     avgr2.local_epoch = 1337
     model2.weight.data[...] = 42