
Remove trailing dots in log messages and errors (#419)

Alexander Borzunov 3 years ago
parent
commit
12099ce03c
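
The commit applies a single logging convention across the codebase: messages passed to the logger and to exception constructors no longer end with a period, since the logging formatter and tracebacks already delimit messages. A minimal sketch of the convention, using one of the messages changed below; the `get_logger` import path and the `peer_id` value are assumptions for illustration, not part of this commit:

```python
# Sketch of the logging style enforced by this commit (import path assumed).
from hivemind.utils.logging import get_logger

logger = get_logger(__name__)

peer_id = "QmExamplePeer"  # hypothetical peer id for illustration

# Before: trailing dot at the end of the message
# logger.debug(f"Peer {peer_id} did not send its state.")

# After: no trailing dot, matching the rest of the codebase
logger.debug(f"Peer {peer_id} did not send its state")
```
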

+ 1 - 1
hivemind/averaging/allreduce.py

@@ -245,7 +245,7 @@ class AllReduceRunner(ServicerBase):
         else:
             error_code = averaging_pb2.MessageCode.Name(request.code)
             logger.debug(f"{self} - peer {context.remote_id} sent {error_code}, allreduce cannot continue")
-            self.finalize(exception=AllreduceException(f"peer {context.remote_id} sent {error_code}."))
+            self.finalize(exception=AllreduceException(f"Peer {context.remote_id} sent {error_code}"))
             yield averaging_pb2.AveragingData(code=averaging_pb2.INTERNAL_ERROR)
 
     def _check_reasons_to_reject(self, request: averaging_pb2.AveragingData) -> Optional[averaging_pb2.AveragingData]:

+ 9 - 9
hivemind/averaging/averager.py

@@ -222,7 +222,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
     @allow_state_sharing.setter
     def allow_state_sharing(self, value: bool):
         if value and self.client_mode:
-            raise ValueError("Cannot allow state sharing: averager in client mode cannot share its state.")
+            raise ValueError("Cannot allow state sharing: averager in client mode cannot share its state")
         else:
             old_value, self._allow_state_sharing.value = self._allow_state_sharing.value, value
             if value != old_value:
@@ -236,7 +236,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
     @state_sharing_priority.setter
     def state_sharing_priority(self, value: float):
         if value and self.client_mode:
-            raise ValueError("State sharing priority is unused: averager in client mode cannot share its state.")
+            raise ValueError("State sharing priority is unused: averager in client mode cannot share its state")
         else:
             old_value, self._state_sharing_priority.value = self._state_sharing_priority.value, value
             if self.allow_state_sharing and value != old_value:
@@ -278,7 +278,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
                 if not self.client_mode:
                     await self.add_p2p_handlers(self._p2p, namespace=self.prefix)
                 else:
-                    logger.debug(f"The averager is running in client mode.")
+                    logger.debug("The averager is running in client mode")
 
                 self._matchmaking = Matchmaking(
                     self._p2p,
@@ -339,7 +339,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
             self._inner_pipe.send(("_SHUTDOWN", None))  # shut down background thread in master
             self.join(self.shutdown_timeout)
             if self.is_alive():
-                logger.warning("Averager did not shut down within the grace period; terminating it the hard way.")
+                logger.warning("Averager did not shut down within the grace period; terminating it the hard way")
                 self.terminate()
         else:
             logger.exception("Averager shutdown has no effect: the process is already not alive")
@@ -380,7 +380,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
         :returns: on success, update averaged_tensors and return group info; on failure, return None
         """
         if self.mode == AveragingMode.AUX and weight is not None:
-            logger.warning("Averager is running in auxiliary mode, weight is unused.")
+            logger.warning("Averager is running in auxiliary mode, weight is unused")
         if scheduled_time is None:
             scheduled_time = get_dht_time() + self.matchmaking_kwargs["min_matchmaking_time"]
         if weight is None:
@@ -448,7 +448,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
                     group_info = await matchmaking_task
 
                     if group_info is None:
-                        raise AllreduceException("Averaging step failed: could not find a group.")
+                        raise AllreduceException("Averaging step failed: could not find a group")
 
                     step.stage = AveragingStage.RUNNING_ALLREDUCE
 
@@ -499,7 +499,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
             stub = type(self).get_stub(self._p2p, peer_id, namespace=self.prefix)
             await afirst(await stub.rpc_aggregate_part(as_aiter(error)))
         except Exception as e:
-            logger.debug(f"Caught {e} when sending error {averaging_pb2.MessageCode.Name(code)} to {peer_id}.")
+            logger.debug(f"Caught {e} when sending error {averaging_pb2.MessageCode.Name(code)} to {peer_id}")
 
     async def _run_allreduce(self, group_info: GroupInfo, min_vector_size: int, **kwargs) -> GatheredData:
         """Run All-Reduce in a given group and update tensors in place, return gathered metadata"""
@@ -687,7 +687,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
             }
 
             if not isinstance(peer_priority, dict) or len(peer_priority) == 0:
-                logger.info(f"Averager could not load state from peers: peer dict empty or corrupted {peer_priority}.")
+                logger.info(f"Averager could not load state from peers: peer dict empty or corrupted {peer_priority}")
                 future.set_result(None)
                 return
 
@@ -712,7 +712,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
                             tensors.append(deserialize_torch_tensor(combine_from_streaming(current_tensor_parts)))
 
                         if not metadata:
-                            logger.debug(f"Peer {peer} did not send its state.")
+                            logger.debug(f"Peer {peer} did not send its state")
                             continue
 
                         logger.info(f"Finished downloading state from {peer}")

+ 1 - 1
hivemind/averaging/control.py

@@ -84,7 +84,7 @@ class StepControl(MPFuture):
         if self.began_allreduce:
             logger.warning("Changing scheduled time has no effect after all-reduce has already started")
         if scheduled_time >= self.deadline:
-            logger.warning("Changing scheduled time to after deadline, averaging will likely fail due to timeout.")
+            logger.warning("Changing scheduled time to after deadline, averaging will likely fail due to timeout")
         struct.pack_into("d", self._shared_buffer[StepControl._SCHEDULED_TIME].numpy().data, 0, float(scheduled_time))
 
     @property

+ 2 - 2
hivemind/averaging/key_manager.py

@@ -34,7 +34,7 @@ class GroupKeyManager:
     ):
         assert all(bit in "01" for bit in initial_group_bits)
         if target_group_size is not None and not is_power_of_two(target_group_size):
-            logger.warning("It is recommended to set target_group_size to a power of 2.")
+            logger.warning("It is recommended to set target_group_size to a power of 2")
 
         self.dht, self.prefix, self.group_bits = dht, prefix, initial_group_bits
         self.target_group_size = target_group_size
@@ -80,7 +80,7 @@ class GroupKeyManager:
         assert is_valid_group(group_key), f"Group key {group_key} is invalid, must follow {GROUP_PATTERN}"
         result = await self.dht.get(group_key, latest=True, return_future=True)
         if result is None or not isinstance(result.value, dict):
-            logger.debug(f"Allreduce group not found: {group_key}, creating new group.")
+            logger.debug(f"Allreduce group not found: {group_key}, creating new group")
             return []
         averagers = []
         for key, looking_for_group in result.value.items():

+ 1 - 1
hivemind/averaging/load_balancing.py

@@ -80,7 +80,7 @@ def optimize_parts_lp(vector_size: int, bandwidths: np.ndarray, min_size: int =
             peer_scores[peer_scores < min_size / float(vector_size)] = 0.0
         peer_scores = np.round(peer_scores, LOAD_BALANCING_LP_DECIMALS)
     else:
-        logger.error(f"Failed to solve load-balancing for bandwidths {bandwidths}.")
+        logger.error(f"Failed to solve load-balancing for bandwidths {bandwidths}")
         peer_scores = np.ones(group_size, c.dtype)
 
     return peer_scores[np.argsort(permutation)]

+ 2 - 2
hivemind/averaging/matchmaking.py

@@ -378,7 +378,7 @@ class Matchmaking:
             for peer_id in ordered_peer_ids
         )
 
-        logger.debug(f"{self.peer_id} - assembled group of {len(ordered_peer_ids)} peers.")
+        logger.debug(f"{self.peer_id} - assembled group of {len(ordered_peer_ids)} peers")
         group_info = GroupInfo(group_id, tuple(ordered_peer_ids), gathered)
         await self.group_key_manager.update_key_on_group_assembled(group_info, is_leader=True)
         self.assembled_group.set_result(group_info)
@@ -395,7 +395,7 @@ class Matchmaking:
         assert self.peer_id in ordered_peer_ids, "Leader sent us group_peer_ids that does not contain us!"
         assert len(ordered_peer_ids) == len(msg.gathered)
 
-        logger.debug(f"{self.peer_id} - follower assembled group with leader {leader}.")
+        logger.debug(f"{self.peer_id} - follower assembled group with leader {leader}")
         group_info = GroupInfo(group_id, tuple(ordered_peer_ids), tuple(msg.gathered))
         await self.group_key_manager.update_key_on_group_assembled(group_info)
         self.assembled_group.set_result(group_info)

+ 1 - 1
hivemind/averaging/partition.py

@@ -139,7 +139,7 @@ class TensorPartContainer:
                     self._output_part_available[peer_index].clear()
                     await self._output_part_available[peer_index].wait()
                     if self.finished.is_set():
-                        raise AllreduceException("All-reduce was terminated during iteration.")
+                        raise AllreduceException("All-reduce was terminated during iteration")
 
                 tensor_parts.append(self._output_parts_by_peer[peer_index].popleft())
                 num_parts_processed += 1

+ 1 - 1
hivemind/dht/dht.py

@@ -151,7 +151,7 @@ class DHT(mp.Process):
             self._outer_pipe.send(("_shutdown", [], {}))
             self.join(self.shutdown_timeout)
             if self.is_alive():
-                logger.warning("DHT did not shut down within the grace period; terminating it the hard way.")
+                logger.warning("DHT did not shut down within the grace period; terminating it the hard way")
                 self.terminate()
 
     async def _shutdown(self):

+ 1 - 1
hivemind/dht/node.py

@@ -717,7 +717,7 @@ class DHTNode:
         """Add key to a refresh queue, refresh at :refresh_time: or later"""
         if self.cache_refresh_task is None or self.cache_refresh_task.done() or self.cache_refresh_task.cancelled():
             self.cache_refresh_task = asyncio.create_task(self._refresh_stale_cache_entries())
-            logger.debug("Spawned cache refresh task.")
+            logger.debug("Spawned cache refresh task")
         earliest_key, earliest_item = self.cache_refresh_queue.top()
         if earliest_item is None or refresh_time < earliest_item.expiration_time:
             self.cache_refresh_evt.set()  # if we new element is now earliest, notify the cache queue

+ 1 - 1
hivemind/dht/routing.py

@@ -217,7 +217,7 @@ class KBucket:
 
     def __delitem__(self, node_id: DHTID):
         if not (node_id in self.nodes_to_peer_id or node_id in self.replacement_nodes):
-            raise KeyError(f"KBucket does not contain node id={node_id}.")
+            raise KeyError(f"KBucket does not contain node id={node_id}")
 
         if node_id in self.replacement_nodes:
             del self.replacement_nodes[node_id]

+ 1 - 1
hivemind/hivemind_cli/run_server.py

@@ -29,7 +29,7 @@ def main():
                         help="specify the exact list of expert uids to create. Use either this or num_experts"
                              " and expert_pattern, not both")
     parser.add_argument('--expert_cls', type=str, default='ffn', required=False,
-                        help="expert type from test_utils.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop'.")
+                        help="expert type from test_utils.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop'")
     parser.add_argument('--hidden_dim', type=int, default=1024, required=False, help='main dimension for expert_cls')
 
     parser.add_argument('--num_handlers', type=int, default=None, required=False,

+ 2 - 2
hivemind/moe/client/moe.py

@@ -238,7 +238,7 @@ class _RemoteCallMany(torch.autograd.Function):
             pending_tasks, num_samples, k_min, forward_timeout, timeout_after_k_min, detect_anomalies
         )
         if len(responded_inds) < k_min:
-            raise TimeoutError(f"Forward pass: less than {k_min} responded within timeout.")
+            raise TimeoutError(f"Forward pass: less than {k_min} responded within timeout")
 
         if not isinstance(info["outputs_schema"], tuple):
             outputs_schema = (info["outputs_schema"],)
@@ -330,7 +330,7 @@ class _RemoteCallMany(torch.autograd.Function):
             pending_tasks, num_samples, backward_k_min, backward_timeout, timeout_after_k_min, detect_anomalies
         )
         if len(survivor_inds) < backward_k_min:
-            raise TimeoutError(f"Backward pass: less than {backward_k_min} experts responded within timeout.")
+            raise TimeoutError(f"Backward pass: less than {backward_k_min} experts responded within timeout")
 
         # assemble responses
         batch_inds, expert_inds = map(

+ 2 - 2
hivemind/moe/server/__init__.py

@@ -333,7 +333,7 @@ def background_server(*args, shutdown_timeout=5, **kwargs) -> Tuple[hivemind.End
         if runner.is_alive():
             logger.info("Server failed to shutdown gracefully, terminating it the hard way...")
             runner.kill()
-            logger.info("Server terminated.")
+            logger.info("Server terminated")
 
 
 def _server_runner(pipe, *args, **kwargs):
@@ -353,4 +353,4 @@ def _server_runner(pipe, *args, **kwargs):
         logger.info("Shutting down server...")
         server.shutdown()
         server.join()
-        logger.info("Server shut down.")
+        logger.info("Server shut down")

+ 7 - 7
hivemind/optim/collaborative.py

@@ -233,7 +233,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
         :note: this .step is different from normal pytorch optimizers in several key ways. See __init__ for details.
         """
         if grad_scaler is not None and not isinstance(grad_scaler, HivemindGradScaler):
-            raise ValueError("CollaborativeOptimizer requires a hivemind-aware gradient scaler (HivemindGradScaler).")
+            raise ValueError("CollaborativeOptimizer requires a hivemind-aware gradient scaler (HivemindGradScaler)")
         if self.batch_size_per_step is None:
             if batch_size is None:
                 raise ValueError("Please either set batch_size_per_step parameter at init or when calling .step")
@@ -242,12 +242,12 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
         batch_size = batch_size if batch_size is not None else self.batch_size_per_step
 
         if not self.is_synchronized and not self.is_within_tolerance:
-            logger.log(self.status_loglevel, "Peer is out of sync.")
+            logger.log(self.status_loglevel, "Peer is out of sync")
             self.load_state_from_peers()
             return
         elif not self.is_synchronized and self.is_within_tolerance:
             self.averager.local_step = self.collaboration_state.optimizer_step
-            logger.log(self.status_loglevel, f"Catching up with collaboration step {self.local_step}.")
+            logger.log(self.status_loglevel, f"Catching up with collaboration step {self.local_step}")
 
         if grad_scaler is not None and not grad_scaler.are_grads_finite(self):
             logger.log(self.status_loglevel, "Encountered incorrect value in fp16 grads, resetting local gradients")
@@ -304,12 +304,12 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
                                 logger.warning(f"Peer {peer} sent malformed data about current step: {peer_step}")
 
                 except BaseException as e:
-                    logger.log(self.status_loglevel, f"Skipped averaging: averaging round failed with {repr(e)}.")
+                    logger.log(self.status_loglevel, f"Skipped averaging: averaging round failed with {repr(e)}")
 
             else:
                 logger.log(
                     self.status_loglevel,
-                    f"Skipped averaging: collaboration consists of " f"{self.collaboration_state.num_peers} peer(s).",
+                    f"Skipped averaging: collaboration consists of " f"{self.collaboration_state.num_peers} peer(s)",
                 )
 
             if grad_scaler is not None:
@@ -365,7 +365,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
                         else:
                             logger.warning(f"Peer {peer} sent malformed data about current step: {peer_step}")
             except BaseException as e:
-                logger.log(self.status_loglevel, f"Skipped averaging: averaging round failed with {repr(e)}.")
+                logger.log(self.status_loglevel, f"Skipped averaging: averaging round failed with {repr(e)}")
 
             self.collaboration_state.register_step(current_step + 1)
             self.averager.local_step = current_step + 1
@@ -552,7 +552,7 @@ class CollaborativeOptimizer(DecentralizedOptimizerBase):
             value=None,
             expiration_time=get_dht_time() + self.metadata_expiration,
         )
-        logger.debug(f"{self.__class__.__name__} is shut down.")
+        logger.debug(f"{self.__class__.__name__} is shut down")
 
     def __del__(self):
         self.shutdown()

+ 5 - 5
hivemind/optim/experimental/grad_averager.py

@@ -119,7 +119,7 @@ class GradientAverager(DecentralizedAverager):
         if self._accumulators_used_in_step and self.warn:
             logger.warning(
                 "[warn=True] Gradient accumulators were not reset since the last averaging round. Please "
-                "call .reset_accumulated_grads_ after every step or use .step(reset_accumulators=True)."
+                "call .reset_accumulated_grads_ after every step or use .step(reset_accumulators=True)"
             )
             self._accumulators_used_in_step = False  # warn once per round
         if self._anchor_batch_size is None:
@@ -168,12 +168,12 @@ class GradientAverager(DecentralizedAverager):
         if control is None:
             control = self.schedule_step(timeout=timeout, **kwargs)
         elif len(kwargs) > 0:
-            raise RuntimeError(f"Averaging with a pre-scheduled group, parameters {kwargs} will have no effect.")
-        assert not control.triggered, f"This {type(control)} instance was already used."
+            raise RuntimeError(f"Averaging with a pre-scheduled group, parameters {kwargs} will have no effect")
+        assert not control.triggered, f"This {type(control)} instance was already used"
         if self._new_averaged_grads and self.warn:
             logger.warning(
-                "[warn=True] Starting new averaging round, but previous round results were not used."
-                "This may be a sign of incorrect optimizer behavior."
+                "[warn=True] Starting new averaging round, but previous round results were not used. "
+                "This may be a sign of incorrect optimizer behavior"
             )
 
         self.load_accumulators_into_averager_()

+ 9 - 9
hivemind/optim/experimental/optimizer.py

@@ -377,7 +377,7 @@ class Optimizer(torch.optim.Optimizer):
                 loss = closure()
 
         if not self.auxiliary and self.should_load_state_from_peers():
-            logger.log(self.status_loglevel, "Peer is out of sync.")
+            logger.log(self.status_loglevel, "Peer is out of sync")
             self.load_state_from_peers()
             return loss  # local gradients were computed with out-of-sync parameters, must start over
 
@@ -481,7 +481,7 @@ class Optimizer(torch.optim.Optimizer):
                 if not self.client_mode:
                     self.grad_averager.state_sharing_priority = self.local_epoch
 
-            logger.log(self.status_loglevel, f"Transitioning to epoch {self.local_epoch}.")
+            logger.log(self.status_loglevel, f"Transitioning to epoch {self.local_epoch}")
 
     def _begin_averaging_gradients(self, grad_scaler: Optional[GradScaler]) -> bool:
         """Begin an all-reduce round to average gradients; return True if succeeded, False if failed"""
@@ -537,7 +537,7 @@ class Optimizer(torch.optim.Optimizer):
 
                 eta_seconds = self.tracker.estimated_next_update_time - get_dht_time()
                 eta_seconds = max(eta_seconds, self.grad_averager.matchmaking_kwargs["min_matchmaking_time"])
-                logger.log(self.status_loglevel, f"Pre-scheduling gradient averaging round in {eta_seconds:.2f}s.")
+                logger.log(self.status_loglevel, f"Pre-scheduling gradient averaging round in {eta_seconds:.2f} sec")
                 self.scheduled_grads = self.grad_averager.schedule_step(timeout=self.averaging_timeout)
 
     def _maybe_schedule_state_averaging(self) -> None:
@@ -558,7 +558,7 @@ class Optimizer(torch.optim.Optimizer):
 
                 min_matchmaking_time = self.state_averager.matchmaking_kwargs["min_matchmaking_time"]
                 actual_seconds = max(eta_seconds_to_averaging, min_matchmaking_time)
-                logger.log(self.status_loglevel, f"Pre-scheduling state averaging round in {actual_seconds:.2f}s.")
+                logger.log(self.status_loglevel, f"Pre-scheduling state averaging round in {actual_seconds:.2f} sec")
                 self.scheduled_state = self.state_averager.schedule_step(
                     gather=next_epoch, timeout=self.averaging_timeout
                 )
@@ -606,7 +606,7 @@ class Optimizer(torch.optim.Optimizer):
         if self.use_gradient_averaging and self.grad_averager.reuse_grad_buffers:
             raise ValueError(
                 f"When running {self.__class__.__name__} with reuse_grad_buffers=True, user should never "
-                f"call zero_grad manually. Gradients will be refreshed internally."
+                f"call zero_grad manually. Gradients will be refreshed internally"
             )
         for param_group in self.param_groups:
             for param in param_group["params"]:
@@ -658,7 +658,7 @@ class Optimizer(torch.optim.Optimizer):
                     continue
 
             if self.tracker.global_epoch - 1 <= self.local_epoch < self.tracker.global_epoch:
-                logger.log(self.status_loglevel, f"Catching up with collaboration step {self.tracker.global_epoch}.")
+                logger.log(self.status_loglevel, f"Catching up with collaboration step {self.tracker.global_epoch}")
                 self.state_averager.local_epoch = self.tracker.global_epoch
 
             self.tracker.report_local_progress(local_epoch=self.local_epoch, samples_accumulated=0)
@@ -722,8 +722,8 @@ class Optimizer(torch.optim.Optimizer):
 
     def add_param_group(self, param_group: dict) -> None:
         raise ValueError(
-            f"{self.__class__.__name__} does not support calling add_param_group after creation."
-            f"Please provide all parameter groups at init."
+            f"{self.__class__.__name__} does not support calling add_param_group after creation. "
+            f"Please provide all parameter groups at init"
         )
 
     def __repr__(self):
@@ -738,7 +738,7 @@ class Optimizer(torch.optim.Optimizer):
         self.state_averager.shutdown()
         if self.use_gradient_averaging:
             self.grad_averager.shutdown()
-        logger.log(self.status_loglevel, f"{self.__class__.__name__} is shut down.")
+        logger.log(self.status_loglevel, f"{self.__class__.__name__} is shut down")
 
     def __del__(self):
         if self._parent_pid == os.getpid() and self.is_alive():

+ 4 - 4
hivemind/optim/experimental/progress_tracker.py

@@ -202,10 +202,10 @@ class ProgressTracker(threading.Thread):
                 logger.debug(f"Will report progress again in {wait_timeout} seconds or on user command")
                 await asyncio.get_event_loop().run_in_executor(None, self.should_report_progress.wait, wait_timeout)
                 if self.should_report_progress.is_set():
-                    logger.debug(f"Progress update triggered by report_local_progress.")
+                    logger.debug(f"Progress update triggered by report_local_progress")
                     self.should_report_progress.clear()
                 else:
-                    logger.debug(f"Progress update triggered by metadata_expiration.")
+                    logger.debug(f"Progress update triggered by metadata_expiration")
 
                 local_progress = self.local_progress
                 last_report_time = get_dht_time()
@@ -223,7 +223,7 @@ class ProgressTracker(threading.Thread):
                     )
                 )
         finally:
-            logger.log(self.status_loglevel, f"No longer reporting progress for {self.prefix}.")
+            logger.log(self.status_loglevel, f"No longer reporting progress for {self.prefix}")
             if store_task is not None:
                 store_task.cancel()
 
@@ -265,7 +265,7 @@ class ProgressTracker(threading.Thread):
                     self.fetched_global_progress_this_epoch.set()
 
         finally:
-            logger.log(self.status_loglevel, f"No longer fetching {self.training_progress_key}.")
+            logger.log(self.status_loglevel, f"No longer fetching {self.training_progress_key}")
 
     def _parse_swarm_progress_data(self, metadata: TrainingProgressSchema) -> GlobalTrainingProgress:
         """Read performance statistics reported by peers, estimate progress towards next batch"""

+ 4 - 4
hivemind/optim/experimental/state_averager.py

@@ -162,7 +162,7 @@ class TrainingStateAverager(DecentralizedAverager):
         """Create a new tensor for averaging or reuse the existing one"""
         if self.reuse_tensors and not force_copy:
             if source_tensor.device != torch.device("cpu"):
-                raise ValueError("reuse_tensors is only supported if all averaged tensors are on CPU.")
+                raise ValueError("reuse_tensors is only supported if all averaged tensors are on CPU")
             if not source_tensor.is_shared():
                 source_tensor.share_memory_()
             return source_tensor
@@ -369,7 +369,7 @@ class TrainingStateAverager(DecentralizedAverager):
                 # the possibly different gradients when wait_for_trigger has finished).
                 raise ValueError(
                     "wait_for_trigger is a low-level option that requires manual gradient manipulation. "
-                    "If you know what you're doing, please refer to the comments in the source code for details."
+                    "If you know what you're doing, please refer to the comments in the source code for details"
                 )
         output = None
 
@@ -578,7 +578,7 @@ class TrainingStateAverager(DecentralizedAverager):
         """Copy averaged tensors into their respective local tensors"""
         assert not self.reuse_tensors, "No need to update averaged tensors since they reuse the same memory"
         if self.delta_rule_averaging and self._old_tensors is None:
-            logger.warning("Using delta_rule_averaging, but old tensors were not found. Averaging may have failed.")
+            logger.warning("Using delta_rule_averaging, but old tensors were not found. Averaging may have failed")
         with self.get_tensors() as averaged_tensors:
             local_tensors = list(self._local_tensors())
             assert len(local_tensors) == len(averaged_tensors), "Tensor structure changed during training"
@@ -647,7 +647,7 @@ class TrainingStateAverager(DecentralizedAverager):
         loaded_parameters_and_extras = flat_tensors[:num_parameters_and_extras]
         loaded_opt_tensors = flat_tensors[num_parameters_and_extras:]
         if num_parameters_and_extras != len(loaded_parameters_and_extras):
-            logger.error("Failed to load state from peer, received parameters, extras or metadata.")
+            logger.error("Failed to load state from peer, received parameters, extras or metadata")
             return
 
         with torch.no_grad(), self.lock_averaged_tensors:

+ 1 - 1
hivemind/optim/grad_scaler.py

@@ -63,7 +63,7 @@ class GradScaler(TorchGradScaler):
         if self._is_running_global_step:
             with self._lock:
                 if self._is_ready_to_update:
-                    logger.warning("Please call grad_scaler.update() after each step.")
+                    logger.warning("Please call grad_scaler.update() after each step")
                 assert not isinstance(optimizer, (hivemind.Optimizer, hivemind.DecentralizedOptimizerBase))
                 assert (
                     self._per_optimizer_states[id(optimizer)]["stage"] == OptState.UNSCALED

+ 4 - 4
hivemind/optim/simple.py

@@ -131,16 +131,16 @@ class DecentralizedOptimizer(DecentralizedOptimizerBase):
                 time.sleep(time_to_nearest_interval)
 
             if verbose:
-                logger.info(f"Starting a new averaging round with current parameters.")
+                logger.info(f"Starting a new averaging round with current parameters")
             try:
                 group_info = averager.step(lock_parameters, **kwargs)
                 if verbose:
                     if group_info is not None:
-                        logger.info(f"Finished averaging round in with {len(group_info)} peers.")
+                        logger.info(f"Finished averaging round in with {len(group_info)} peers")
                     else:
-                        logger.warning(f"Averaging round failed: could not find group.")
+                        logger.warning(f"Averaging round failed: could not find group")
             except Exception as e:
-                logger.error(f"Averaging round failed: caught {e}.")
+                logger.error(f"Averaging round failed: caught {e}")
 
 
 class DecentralizedSGD(DecentralizedOptimizer):

+ 1 - 1
hivemind/optim/training_averager.py

@@ -101,7 +101,7 @@ class TrainingAverager(DecentralizedAverager):
                 self.pending_updates_done.clear()
                 with data_lock, self.get_tensors() as averaged_tensors:
                     if len(averaged_tensors) != len(local_tensors):
-                        raise RuntimeError("The number of optimized parameters should not change.")
+                        raise RuntimeError("The number of optimized parameters should not change")
 
                     if use_old_local_tensors:
                         # since tensors might have changed, we subtract old_local_tensor and add averaged. This prevents

+ 1 - 1
hivemind/utils/mpfuture.py

@@ -204,7 +204,7 @@ class MPFuture(base.Future, Generic[ResultType]):
             with MPFuture._update_lock if self._use_lock else nullcontext():
                 self._sender_pipe.send((self._uid, update_type, payload))
         except (ConnectionError, BrokenPipeError, EOFError, OSError) as e:
-            logger.debug(f"No updates were sent: pipe to origin process was broken ({e}).", exc_info=True)
+            logger.debug(f"No updates were sent: pipe to origin process was broken ({e})", exc_info=True)
 
     def set_result(self, result: ResultType):
         if os.getpid() == self._origin_pid:

+ 2 - 2
hivemind/utils/serializer.py

@@ -35,7 +35,7 @@ class MSGPackSerializer(SerializerBase):
                 getattr(wrapped_type, "unpackb", None)
             ), f"Every ext_type must have 2 methods: packb(self) -> bytes and classmethod unpackb(cls, bytes)"
             if type_code in cls._ext_type_codes:
-                logger.warning(f"{cls.__name__}: type {type_code} is already registered, overwriting.")
+                logger.warning(f"{cls.__name__}: type {type_code} is already registered, overwriting")
             cls._ext_type_codes[type_code], cls._ext_types[wrapped_type] = wrapped_type, type_code
             return wrapped_type
 
@@ -60,7 +60,7 @@ class MSGPackSerializer(SerializerBase):
         elif type_code == cls._TUPLE_EXT_TYPE_CODE:
             return tuple(msgpack.unpackb(data, ext_hook=cls._decode_ext_types, raw=False))
 
-        logger.warning(f"Unknown ExtType code: {type_code}, leaving it as is.")
+        logger.warning(f"Unknown ExtType code: {type_code}, leaving it as is")
         return data
 
     @classmethod