
Add Codespell to CI, fix typos (#543)

Max Ryabinin 2 years ago
parent commit 2a71fdab6f

+ 8 - 0
.github/workflows/check-style.yml

@@ -24,3 +24,11 @@ jobs:
       - uses: isort/isort-action@master
         with:
           isortVersion: "5.10.1"
+
+  codespell:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: codespell-project/actions-codespell@v1
+        with:
+          only_warn: 1
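
The new `codespell` job checks out the repository and runs the Codespell action; with `only_warn: 1`, spelling issues are reported in the CI log without failing the check. A rough local equivalent (a sketch, assuming codespell is installed at the version pinned in requirements-dev.txt below):

```sh
# Install the same codespell version that is pinned in requirements-dev.txt
pip install codespell==2.2.2

# Scan the repository for common misspellings, skipping the .git directory.
# Unlike the only_warn CI job, this exits with a non-zero status if typos are found.
codespell --skip=".git"
```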

+ 2 - 1
CONTRIBUTING.md

@@ -38,7 +38,8 @@ with the following rules:
   cannot be longer than 119 characters.
 * We use [black](https://github.com/psf/black) for code formatting and [isort](https://github.com/PyCQA/isort) for 
   import sorting. Before submitting a PR, make sure to install and run `black .` and `isort .` in the root of the
-  repository.
+  repository. Also, you may want to check your code for typos by running `codespell --skip=".git"`, though there
+  might be false positives.
 * We highly encourage the use of [typing](https://docs.python.org/3/library/typing.html) where applicable.
 * Use `get_logger` from `hivemind.utils.logging` to log any information instead of `print`ing directly to standard
   output/error streams.
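
With this change, a typical pre-submission check combines the formatting tools mentioned above with the typo scan; a minimal sketch (tool versions as pinned in requirements-dev.txt):

```sh
# Reformat code and sort imports in place, then scan for typos
black .
isort .
codespell --skip=".git"  # may report false positives; review before editing anything
```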

+ 1 - 1
README.md

@@ -29,7 +29,7 @@ see the [full list](#citation) of our papers below.
 ## Example Use Cases
 
 This section lists projects that leverage hivemind for decentralized training. 
-If you have succesfully trained a model or created a downstream repository with the help of our library, 
+If you have successfully trained a model or created a downstream repository with the help of our library, 
 feel free to submit a pull request that adds your project to this list.
 
 * **Petals** ([webpage](https://petals.ml), [code](https://github.com/bigscience-workshop/petals)) — a decentralized platform for inference and fine-tuning of 100B+ language models.

+ 1 - 1
benchmarks/benchmark_dht.py

@@ -51,7 +51,7 @@ async def store_and_get_task(
     latest: bool,
     node_killer: NodeKiller,
 ) -> Tuple[list, list, list, list, int, int]:
-    """Iteratively choose random peers to store data onto the dht, then retreive with another random subset of peers"""
+    """Iteratively choose random peers to store data onto the dht, then retrieve with another random subset of peers"""
 
     total_stores = total_gets = 0
     successful_stores = []

+ 1 - 1
docs/modules/optim.rst

@@ -5,7 +5,7 @@
 
   This module contains decentralized optimizers that wrap your regular PyTorch Optimizer to train with peers.
   Depending on the exact configuration, Optimizer may perform large synchronous updates equivalent,
-  or perform asynchrnous local updates and average model parameters.
+  or perform asynchronous local updates and average model parameters.
 
   <br><br>
 

+ 1 - 1
docs/user/dht.md

@@ -119,7 +119,7 @@ dht = hivemind.DHT(
     ], start=True)
 ```
 
-Thats it, now the two DHT nodes are connected. If you connect additional peers to the network, you only need to specify
+That's it, now the two DHT nodes are connected. If you connect additional peers to the network, you only need to specify
 one (or a subset) of peers as `initial_peers`.
 In case your peer operates behind a restrictive firewall, you may find it beneficial to set `client_mode=True`. In this
  case, the DHT instance will access others, but it will not announce that other peers can connect to it.

+ 3 - 3
hivemind/averaging/averager.py

@@ -62,7 +62,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
     :param min_matchmaking_time: when looking for group, wait for requests for at least this many seconds
     :param compression: optionally compress tensors with this compression algorithm before running all-reduce
     :param state_compression: a separate compression strategy for load_state_from_peers (default = no compression)
-    :param tensor_infos: CompressionInfo for each respective tensor; this determines how the tensor will be comressed
+    :param tensor_infos: CompressionInfo for each respective tensor; this determines how the tensor will be compressed
     :param averaging_alpha: optional "learning rate" for averaging. If specified, local parameters will be shifted
       towards the (estimated) average by this coefficient. By default, local parameters are set equal to average.
     :param request_timeout: when looking for group, wait for a response from leader for at most this many seconds.
@@ -376,7 +376,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
         """
         Set up the averager to look for a group and run one round of averaging, return True on success, False on failure
 
-        :param gather: optionally send this informaton to all peers in the next group and gather it from every groupmate
+        :param gather: optionally send this information to all peers in the next group and gather it from every groupmate
           (this operation is known as all-gather). The gathered data will be available as the output of this function.
         :param scheduled_time: when matchmaking, assume that all-reduce will begin at this moment.
           By default, schedule all-reduce current time plus min_matchmaking_time seconds
@@ -651,7 +651,7 @@ class DecentralizedAverager(mp.Process, ServicerBase):
 
     def get_current_state(self) -> Tuple[Any, Sequence[torch.Tensor], Sequence[CompressionInfo]]:
         """
-        Get current state and send it to a peer. executed in the host process. Meant to be overriden.
+        Get current state and send it to a peer. executed in the host process. Meant to be overridden.
         :returns: a tuple of (small metadata, sequence of torch tensors)
         :note: metadata must be seriablizable with self.serializer (default = MSGPackSerializer)
         """

+ 1 - 1
hivemind/averaging/partition.py

@@ -26,7 +26,7 @@ class TensorPartContainer:
     :param peer_fractions: for each peer, a target fraction of vector elements that this peer should average
     :param compression: optionally compress tensors with this compression algorithm before sending them to peers
     :param part_size_bytes: greedily split tensors into parts of up to this many bytes (after compression)
-    :param tensor_infos: CompressionInfo for each respective tensor; this determines how the tensor will be comressed
+    :param tensor_infos: CompressionInfo for each respective tensor; this determines how the tensor will be compressed
     :param return_deltas: if True, output tensors are differences (aggregated tensor - local tensor)
     :param prefetch: when compressing, pre-compute this many compressed tensors in background
     """

+ 1 - 1
hivemind/compression/base.py

@@ -53,7 +53,7 @@ class CompressionBase(ABC):
         """
         Applies compression algorithm to a tensor based on their meta-parameters
 
-        :param tensor: a pytorch tensor to compress; depending on the applicaiton, it is a full tensor or a part
+        :param tensor: a pytorch tensor to compress; depending on the application, it is a full tensor or a part
         :param info: meta-information about the tensor; if partitioning is used, this still describes the full tensor
         :param allow_inplace: if True, compression can (but doesn't have to) to modify tensor in-place for efficiency
         :returns: a protobuf message that encodes the tensor

+ 1 - 1
hivemind/dht/node.py

@@ -586,7 +586,7 @@ class DHTNode:
             If min_expiration_time=float('inf'), this method will find a value with _latest_ expiration
         :param beam_size: maintains up to this many nearest nodes when crawling dht, default beam_size = bucket_size
         :param num_workers: override for default num_workers, see traverse_dht num_workers param
-        :param return_futures: if True, immediately return asyncio.Future for every before interacting with the nework.
+        :param return_futures: if True, immediately return asyncio.Future for every before interacting with the network.
          The algorithm will populate these futures with (value, expiration) when it finds the corresponding key
          Note: canceling a future will stop search for the corresponding key
         :param _is_refresh: internal flag, set to True by an internal cache refresher (if enabled)

+ 1 - 1
hivemind/dht/routing.py

@@ -1,4 +1,4 @@
-""" Utlity data structures to represent DHT nodes (peers), data keys, and routing tables. """
+""" Utility data structures to represent DHT nodes (peers), data keys, and routing tables. """
 from __future__ import annotations
 
 import hashlib

+ 1 - 1
hivemind/moe/server/server.py

@@ -302,7 +302,7 @@ class Server(threading.Thread):
         logger.debug(f"Shutting down runtime")
         self.runtime.shutdown()
 
-        logger.info("Server shutdown succesfully")
+        logger.info("Server shutdown successfully")
 
 
 @contextmanager

+ 1 - 1
hivemind/optim/grad_averager.py

@@ -29,7 +29,7 @@ class GradientAverager(DecentralizedAverager):
     (3) averaged gradients - gradient buffers that are aggregated in-place with peers, always in host memory
 
     :param parameters: pytorch parameters for which to aggregate gradients
-    :param dht: a DHT isntance connected to the rest of the swarm. See hivemind.DHT docs
+    :param dht: a DHT instance connected to the rest of the swarm. See hivemind.DHT docs
     :param prefix: a unique DHT key used for matchmaking. E.g. this can be your experiment name with optional suffixes
     :param reuse_grad_buffers: if True, use model's .grad buffers for accumulating gradients over multiple steps.
       This is more memory efficient, but it requires that the user does *not* call zero_grad or clip_by_whatever at all

+ 2 - 2
hivemind/optim/optimizer.py

@@ -56,11 +56,11 @@ class Optimizer(torch.optim.Optimizer):
 
     Unlike regular training, your device may join midway through training, when other peers already made some progress.
     For this reason, any learning rate schedulers, curriculum and other **time-dependent features should be based on**
-    ``optimizer.local_epoch`` (and not the number ot calls to opt.step). Otherwise, peers that joined training late
+    ``optimizer.local_epoch`` (and not the number of calls to opt.step). Otherwise, peers that joined training late
     may end up having different learning rates. To do so automatically, specify ``scheduler=...`` parameter below.
 
     :What is an epoch?: Optimizer uses the term ``epoch`` to describe intervals between synchronizations. One epoch
-      coresponds to processing certain number of training samples (``target_batch_size``) in total across all peers.
+      corresponds to processing certain number of training samples (``target_batch_size``) in total across all peers.
       Like in PyTorch LR Scheduler, **epoch does not necessarily correspond to a full pass over the training data.**
       At the end of epoch, peers perform synchronous actions such as averaging gradients for a global optimizer update,
       updating the learning rate scheduler or simply averaging parameters (if using local updates).

+ 1 - 1
hivemind/optim/power_sgd_averager.py

@@ -51,7 +51,7 @@ class PowerSGDGradientAverager(GradientAverager):
 
     :param parameters: pytorch parameters for which to aggregate gradients
     :param averager_rank: rank of compressed gradients
-    :param dht: a DHT isntance connected to the rest of the swarm. See hivemind.DHT docs
+    :param dht: a DHT instance connected to the rest of the swarm. See hivemind.DHT docs
     :param prefix: a unique DHT key used for matchmaking. E.g. this can be your experiment name with optional suffixes
     :param reuse_grad_buffers: if True, use model's .grad buffers for accumulating gradients over multiple steps.
       This is more memory efficient, but it requires that the user does *not* call zero_grad or clip_by_whatever at all

+ 1 - 1
hivemind/utils/math.py

@@ -15,7 +15,7 @@ def orthogonalize_(matrix, eps: float = 1e-8):
 
 
 def get_flatten_greedy_dims(tensor: torch.Tensor, max_ndim: int = 2):
-    """get dims to flatten tensor upto max_ndim dimensions by merging small axes together"""
+    """get dims to flatten tensor up to max_ndim dimensions by merging small axes together"""
     dims = list(tensor.shape)
     while len(dims) > max_ndim:
         squeeze_ix = min(range(len(dims) - 1), key=lambda i: dims[i] * dims[i + 1])

+ 1 - 0
requirements-dev.txt

@@ -8,4 +8,5 @@ scikit-learn
 torchvision
 black==22.3.0
 isort==5.10.1
+codespell==2.2.2
 psutil

+ 1 - 1
tests/test_averaging.py

@@ -356,7 +356,7 @@ def test_load_state_from_peers():
     class TestAverager(DecentralizedAverager):
         def get_current_state(self):
             """
-            Get current state and send it to a peer. executed in the host process. Meant to be overriden.
+            Get current state and send it to a peer. executed in the host process. Meant to be overridden.
             :returns: a tuple of (serializable_small_metadata, sequence of torch tensors)
             """
             nonlocal num_calls, super_metadata, super_tensors