
Add the contribution guide (#156)

* Shorten pipeline names, remove codecov

* Add contribution guide, update documentation

* Bump version to 0.9.0

* Speed up averaging/DMoE tests
Max Ryabinin 4 years ago
parent commit 520562af4b

+ 4 - 10
.circleci/config.yml

@@ -1,7 +1,7 @@
 version: 2.1
 
 jobs:
-  build-and-test-py3-8-1:
+  build-and-test-py38:
     docker:
       - image: circleci/python:3.8.1
     steps:
@@ -21,10 +21,7 @@ jobs:
       - run:
           command: pytest ./tests
           name: tests
-      - run:
-          command: codecov
-          name: codecov
-  build-and-test-py3-9-1:
+  build-and-test-py39:
     docker:
       - image: circleci/python:3.9.1
     steps:
@@ -44,12 +41,9 @@ jobs:
       - run:
           command: pytest ./tests
           name: tests
-      - run:
-          command: codecov
-          name: codecov
 
 workflows:
   main:
     jobs:
-      - build-and-test-py3-8-1
-      - build-and-test-py3-9-1
+      - build-and-test-py38
+      - build-and-test-py39

+ 143 - 0
CONTRIBUTING.md

@@ -0,0 +1,143 @@
+# Contributing to hivemind
+
+This document covers the technical details of making your contributions to the code of hivemind. For other ways to
+contribute, read the [contributing guide](https://learning-at-home.readthedocs.io/en/latest/user/contributing.html) in
+our documentation.
+
+Before you begin, file a new issue on [GitHub](https://github.com/learning-at-home/hivemind/issues) or announce that you
+are going to work on an existing one to avoid duplicate effort. After you finish, submit a pull request and wait for it
+to be reviewed by the library maintainers (and possibly other community members).
+
+## Environment setup
+
+First, install hivemind in the development mode, preferably with Python 3.8 on Linux.
+
+```
+git clone https://github.com/learning-at-home/hivemind
+cd hivemind
+pip install -e .
+``` 
+
+## Pull Request checklist
+
+To make sure that the reviewers will request minimal changes in your PR, you can check that your contribution complies
+with the following rules:
+
+* All code changes are consistent with the repository [code style](#code-style).
+* The title and the description of your pull request adhere to the formatting guidelines both
+  for [pull requests](#pull-requests) and for [commit messages](#commit-messages).
+* New modules or functions are sufficiently [documented](#building-documentation) and covered
+  with [tests](#running-tests).
+* The CI pipelines both for the documentation and for tests pass successfully.
+* If you make performance-sensitive changes, their impact is measured with [benchmarks](#running-benchmarks) (the more,
+  the better).
+
+## Code style
+
+* The code must follow [PEP8](https://www.python.org/dev/peps/pep-0008/) unless deviating is absolutely necessary. Also, each line
+  cannot be longer than 120 characters.
+* We highly encourage the use of [typing](https://docs.python.org/3/library/typing.html) where applicable.
+* Use `get_logger` from `hivemind.utils.logging` to log any information instead of `print`ing directly to standard
+  output/error streams (see the sketch after this list).
+* Comments should be used sparingly and never describe the obvious; usually it's best to clean up the code logic instead
+  of describing it, as comments might otherwise become redundant (or worse, stale or incorrect).
+* In general, strive for code readability instead of compactness. In particular, prefer to create a new variable instead
+  of a long one-liner and to break up a long method into several meaningful parts. This rule can be overridden in case
+  of major performance considerations, but only if verified by benchmarks.
+* Each user-facing function must have a [correct](#building-documentation) docstring that describes the intended usage,
+  the input arguments and the return value. Both in comments and docstrings, please try to follow the capitalization
+  rules for all terms and objects and to use proper grammar.
+
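+As a minimal sketch of the logging and typing guidelines above (the function and module names are hypothetical):
+
+```python
+from typing import List
+
+import torch
+
+from hivemind.utils.logging import get_logger
+
+logger = get_logger(__name__)  # one logger per module, named after it
+
+
+def average_norm(tensors: List[torch.Tensor]) -> float:
+    """Compute the mean L2 norm of the given tensors (illustrative helper)."""
+    result = sum(float(t.norm()) for t in tensors) / len(tensors)
+    logger.info(f"Averaged {len(tensors)} tensor norms")  # log instead of print
+    return result
+```
+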
+## Contribution formatting guidelines
+
+To make sure that each change to hivemind is consistent across the entire project history and is easy to review by any
+community member, follow these guidelines when submitting your pull request and writing commit messages. The library
+maintainers use the same rules when merging your commits into the master branch after the PR approval.
+
+### Commit messages
+
+To ensure a consistent format across the entire repository history, please follow these rules when formatting
+your commits (especially for PR merge commits):
+
+* Keep the subject line short, preferably under 50 characters.
+* Capitalize the subject line and do not end it with a period.
+* If possible, write a conceptual description of your commit in the body (the "why") instead of simply restating the
+  changes (the "what"); see the example after this list.
+
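+For instance, the squashed merge commit of this very pull request follows these rules: a short capitalized subject
+line with the PR number, followed by a cleaned-up compilation of the branch commit messages.
+
+```
+Add the contribution guide (#156)
+
+* Shorten pipeline names, remove codecov
+* Add contribution guide, update documentation
+* Bump version to 0.9.0
+* Speed up averaging/DMoE tests
+```
+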
+It is not required to use this format while you are still working on your pull request. However, each merged PR commit
+message has to adhere to these guidelines, and it will be easier for the maintainers to accept the PR if you have
+already done most of the necessary formatting work.
+
+For further reading on the commit message format, see this [guide](https://chris.beams.io/posts/git-commit/#seven-rules)
+on good Git commit messages, as well as this [repository](https://github.com/RomuloOliveira/commit-messages-guide).
+
+### Pull requests
+
+All commits from a pull request are squashed before merging to ensure a clean commit history in the master branch. The
+merge commit title is the name of the pull request along with the PR number reference; the merge commit body is either
+the pull request description (if it adheres to the format) or a cleaned up compilation of PR branch commit messages.
+
+* As such, the name and the description of your PR should follow the same guidelines as commit messages.
+* Try to keep your pull requests narrow in scope and split significant changes to the code base into separate pieces.
+  This will ensure [faster and better](https://essenceofcode.com/2019/10/29/the-art-of-small-pull-requests/) feedback
+  from the reviewers.
+* In particular, try to separate functional and non-functional code changes, as well as independent functional changes
+  if they make the pull request too large to review in a short period of time.
+* In general, when naming a pull request instead of a commit, it's best to highlight the major change in its title
+  instead of listing all modifications. Also, if a pull request makes significant changes to the library, it's best to
+  give a high-level description in the title instead of a technical one:
+  compare `Implement decentralized parameter averaging` with `Add hivemind.client.averaging`.
+
+For more on the philosophy of easy-to-review pull requests, read these
+guides: [1](https://mtlynch.io/code-review-love/), [2](https://www.atlassian.com/blog/git/written-unwritten-guide-pull-requests).
+If the changelist is not already very large (more than a hundred lines), we encourage making small improvements to the
+codebase in the files already changed by the PR; however, they should not dilute its major purpose.
+
+## Running tests
+
+Hivemind uses [pytest](https://github.com/pytest-dev/pytest/) for testing the behavior of the library modules. If you
+implement a new part of the library, you are expected to write a test for the correctness of its implementation. If you
+discovered a bug in the existing code base and intend to fix it, it's also best if you add the steps to reproduce it as
+a new test to make sure it's not reintroduced by future changes.
+
+To run tests, you need to install hivemind in development mode with additional dependencies: `pip install -e .[dev]`.
+You can run all tests with `pytest tests/` or choose a specific subset, e.g., `pytest tests/test_dht.py`.
+
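+As a minimal sketch (the test body is illustrative; a real test should exercise the feature you changed), a new test
+might look like this. The `@pytest.mark.forked` marker is used throughout the existing test suite to isolate each test
+in a subprocess:
+
+```python
+import pytest
+
+import hivemind
+
+
+@pytest.mark.forked
+def test_dht_starts_and_stops():
+    dht = hivemind.DHT(start=True)  # spawn a standalone DHT node
+    try:
+        assert dht.is_alive()  # the DHT runs in a background process
+    finally:
+        dht.shutdown()
+```
+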
+## Building documentation
+
+Any function exposed to a user must have a docstring compatible
+with [Sphinx](https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html), which is used for building the
+online documentation.
+
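+For illustration, a docstring in the Sphinx format might look like this (the function itself is hypothetical):
+
+```python
+import torch
+
+
+def scale_gradients(tensor: torch.Tensor, factor: float = 1.0) -> torch.Tensor:
+    """
+    Scale the gradients of a tensor during the backward pass without changing its value.
+
+    :param tensor: the tensor whose gradients should be scaled
+    :param factor: the multiplier applied to the gradients during backpropagation
+    :returns: a tensor that behaves like the input in the forward pass
+    """
+    return tensor * factor + tensor.detach() * (1 - factor)
+```
+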
+To build the docs locally,
+
+1. `pip install -e .[docs]`
+2. make sure you have installed hivemind in the development mode (see [Environment setup](#environment-setup))
+3. `cd ./docs && make html`
+
+The documentation root will be available at `./docs/_build/html/index.html`.
+
+## Running benchmarks
+
+Currently, hivemind has three benchmark scripts for evaluating the impact of code changes on the most
+performance-sensitive parts of the library. If you make a change that might introduce a regression, you may be asked by
+the maintainers to provide the benchmarking results for your branch and a comparison with the master branch (see the
+example after the list below).
+
+* `tests/benchmark_averaging.py` measures the performance of decentralized parameter averaging across the DHT.
+* `tests/benchmark_dht.py` measures the performance of core DHT operations.
+* `tests/benchmark_throughput.py` measures the performance of a server hosting several expert layers under heavy load
+  from multiple clients.
+
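+For example, a regression check might run the same benchmark on both branches and report the two sets of numbers in
+the pull request (the branch name and flag values below are illustrative):
+
+```sh
+git checkout master
+python tests/benchmark_dht.py --num_peers 64 --num_experts 1024 --expert_batch_size 64
+git checkout my-feature-branch
+python tests/benchmark_dht.py --num_peers 64 --num_experts 1024 --expert_batch_size 64
+```
+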
+Example benchmark runs are available in
+the [benchmarking](https://learning-at-home.readthedocs.io/en/latest/user/benchmarks.html) page of the documentation.
+
+## See also
+
+For more details on overall contributions, visit the contributing guide at:
+
+https://learning-at-home.readthedocs.io/en/latest/user/contributing.html
+
+This guide was inspired by several influential Python open source projects listed below:
+
+* [PyTorch](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md)
+* [Scikit-learn](https://scikit-learn.org/dev/developers/contributing.html)
+* [transformers](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md)

+ 85 - 28
README.md

@@ -1,45 +1,102 @@
-## hivemind: decentralized deep learning in PyTorch
+## Hivemind: decentralized deep learning in PyTorch
+
 [![Build status](https://circleci.com/gh/learning-at-home/hivemind.svg?style=shield)](https://circleci.com/gh/learning-at-home/hivemind)
 [![Documentation Status](https://readthedocs.org/projects/learning-at-home/badge/?version=latest)](https://learning-at-home.readthedocs.io/en/latest/?badge=latest)
 [![Gitter](https://badges.gitter.im/learning-at-home/hivemind.svg)](https://gitter.im/learning-at-home/hivemind?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
 
-Hivemind is a PyTorch library to train large neural networks across the Internet. Imagine training one huge Transformer model
-  on thousands of computers from different universities, companies, and volunteers.
+Hivemind is a PyTorch library to train large neural networks across the Internet. Its intended usage is training a
+single Transformer model on hundreds of computers from different universities, companies, and volunteers.
 
 ![img](https://i.imgur.com/GPxolxb.gif)
 
-### Key Features
- * Train neural networks of arbitrary size: parts of their layers are distributed across the participants
- * Run distributed training without master node: Distributed Hash Table allows to connect computers in a decentralized network
- * Fault-tolerant backpropagation: forward and backward passes succeed even if some nodes are unresponsive or take too long to respond
+## Key Features
+
+* Train neural networks of arbitrary size: parts of their layers are distributed across the participants.
+* Distributed training without a master node: Distributed Hash Table allows connecting computers in a decentralized
+  network.
+* Fault-tolerant backpropagation: forward and backward passes succeed even if some nodes are unresponsive or take too
+  long to respond.
+* Decentralized parameter averaging: iteratively aggregate updates from multiple workers without the need to synchronize
+  across the entire network.
+
+To learn more about the ideas behind this library, see https://learning-at-home.github.io or read
+the [NeurIPS 2020 paper](https://arxiv.org/abs/2002.04013).
+
+## Installation
+
+Before installing hivemind, make sure that your environment has Python 3.8+
+and [PyTorch](https://pytorch.org/get-started/locally/#start-locally) 1.6.0 or newer.
+
+To start using this library, you can either use the pip package manager or build it from source. Since the release
+cycle is not established yet, we recommend installing hivemind from source to keep up with the latest bugfixes and
+improvements.
+
+### With pip
+
+If your versions of Python and PyTorch match the requirements, you can install hivemind from pip:
+
+```
+pip install hivemind
+```
+
+### From source
+
+To install hivemind from source, simply clone the repository and install it:
+
+```
+git clone https://github.com/learning-at-home/hivemind.git
+cd hivemind
+pip install .
+```
+
+If you would like to verify that your installation is working properly, you can install with `pip install -e .[dev]`
+instead. Then, you can run the tests with `pytest tests/`.
 
-To learn more about the idea behind this library and its components, see https://learning-at-home.github.io or read the [NeurIPS 2020 paper](https://arxiv.org/abs/2002.04013)
+## Documentation
 
-### Documentation
- * [Quickstart tutorial](https://learning-at-home.readthedocs.io/en/latest/user/quickstart.html): install hivemind, 
-    set up a server and train experts  
- * Documentation & guides: [learning-at-home.readthedocs.io](https://learning-at-home.readthedocs.io)
+* [Quickstart](https://learning-at-home.readthedocs.io/en/latest/user/quickstart.html): install hivemind, set up a
+  server and train experts
+* Documentation & guides are available at [learning-at-home.readthedocs.io](https://learning-at-home.readthedocs.io)
 
-### Contributing
-Hivemind is currently at the active development stage, and we welcome all contributions from bug fixes and documentation improvements to entirely new features. 
-If you want to contribute to hivemind, take a look at the issues or join [our chat room](https://gitter.im/learning-at-home/hivemind).
-The [Developer's guide](https://learning-at-home.readthedocs.io/en/latest/user/contributing.html) page contains best practices, as well as description of tests and performance benchmarks.
+## Contributing
 
-### References
-You can read the paper that inspired hivemind here:
+Hivemind is under active development, and we welcome all contributions. Everything, from bug fixes and
+documentation improvements to entirely new features, is equally appreciated.
+
+If you want to contribute to hivemind but don't know where to start, take a look at the
+unresolved [issues](https://github.com/learning-at-home/hivemind/issues). Open a new issue or
+join [our chat room](https://gitter.im/learning-at-home/hivemind) in case you want to discuss new functionality or
+report a possible bug. Bug fixes are always welcome, but new features should preferably be discussed with the
+maintainers beforehand.
+
+If you want to start contributing to the source code of hivemind, please see
+the [contributing guidelines](https://github.com/learning-at-home/hivemind/blob/master/CONTRIBUTING.md) first. To learn
+more about other ways to contribute, read
+our [guide](https://learning-at-home.readthedocs.io/en/latest/user/contributing.html).
+
+## Citation
+
+If you found hivemind useful for your experiments, you can cite [the paper](https://arxiv.org/abs/2002.04013) that
+inspired it:
 
-[Towards Crowdsourced Training of Large Neural Networks using Decentralized Mixture-of-Experts](https://arxiv.org/abs/2002.04013) (Max Ryabinin and Anton Gusev, NeurIPS 2020).
 ```
-@misc{ryabinin2020crowdsourced,
-      title={Towards Crowdsourced Training of Large Neural Networks using Decentralized Mixture-of-Experts}, 
-      author={Max Ryabinin and Anton Gusev},
-      year={2020},
-      eprint={2002.04013},
-      archivePrefix={arXiv},
-      primaryClass={cs.DC}
+@inproceedings{ryabinin2020crowdsourced,
+ author = {Ryabinin, Max and Gusev, Anton},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
+ pages = {3659--3672},
+ publisher = {Curran Associates, Inc.},
+ title = {Towards Crowdsourced Training of Large Neural Networks using Decentralized Mixture-of-Experts},
+ url = {https://proceedings.neurips.cc/paper/2020/file/25ddc0f8c9d3e22e03d3076f98d83cb2-Paper.pdf},
+ volume = {33},
+ year = {2020}
 }
 ```
-The initial implementation of hivemind used to conduct experiments for the paper is available here: [mryab/learning-at-home](https://github.com/mryab/learning-at-home).
 
-In the docs, we list several [related](https://learning-at-home.readthedocs.io/en/latest/user/acknowledgements.html) projects and acknowledgements.
+The initial implementation of hivemind used for the paper is available
+at [mryab/learning-at-home](https://github.com/mryab/learning-at-home).
+
+In the documentation, we list
+several [related](https://learning-at-home.readthedocs.io/en/latest/user/acknowledgements.html) projects and
+acknowledgements.
 

+ 2 - 1
docs/index.rst

@@ -5,7 +5,7 @@
   :scale: 48
 
 Hivemind is a library for decentralized deep learning computations. It allows you to train large neural networks using vast numbers
-of computers, whether you're running a very capable computer or less reliable one.
+of computers, whether you're running a very capable computer or a less reliable one.
 Learn how to create or join a Hivemind run in the `quickstart tutorial <./user/quickstart.html>`__ or browse the API
 documentation below.
 
@@ -22,6 +22,7 @@ documentation below.
   user/quickstart
   modules/index
   user/contributing
+  user/benchmarks
   user/acknowledgements
 
 Indices and tables

+ 20 - 18
docs/user/acknowledgements.md

@@ -1,26 +1,28 @@
-## Credits
+# Credits
 
 We kindly thank (in random order)
-* [Artem Babenko](https://research.yandex.com/people/102794) and 
-  [Vladimir Aliev](https://ru.linkedin.com/in/vladimir-aliev-19b93282) (yandex research) for helpful discussions
-  and editorial review of the paper,
-* [Jacob R. Steeves](https://github.com/unconst) (bittensor) for discussions on RPC frameworks and NAT traversal and 
-  peer-to-peer technologies. 
-* [Dmitry Afanasiev](https://www.linkedin.com/in/dmitry-afanasiev-295a231/) (yandex) for his guidance on networking
-  and communication technologies,
-* [Lidi Zheng](https://github.com/lidizheng) (google) and grpc-aio contributors for their awesome framework and [this pr](https://github.com/grpc/grpc/pull/23265)
-* [Brian Muller](https://github.com/bmuller/kademlia) (parallel markets) for his implementations of [kademlia](https://github.com/bmuller/kademlia) and [rpcudp](https://github.com/bmuller/rpcudp)  
-* Alexander Sherbakov (itsoft) for helpful discussions on PC and server component architecture,
-* Our early adopters, [contributors](https://github.com/learning-at-home/hivemind/graphs/contributors), and reviewers
 
+* [Artem Babenko](https://research.yandex.com/people/102794) and
+  [Vladimir Aliev](https://ru.linkedin.com/in/vladimir-aliev-19b93282) for helpful discussions and editorial review of
+  the paper,
+* [Jacob R. Steeves](https://github.com/unconst) for discussions on RPC frameworks and NAT traversal and peer-to-peer
+  technologies.
+* [Dmitry Afanasiev](https://www.linkedin.com/in/dmitry-afanasiev-295a231/) for his guidance on networking and
+  communication technologies,
+* [Lidi Zheng](https://github.com/lidizheng) and grpc-aio contributors for their awesome framework
+  and [this PR](https://github.com/grpc/grpc/pull/23265)
+* [Brian Muller](https://github.com/bmuller/kademlia) for his implementations
+  of [kademlia](https://github.com/bmuller/kademlia) and [rpcudp](https://github.com/bmuller/rpcudp)
+* Alexander Sherbakov for helpful discussions on PC and server component architecture,
+* Our early adopters, [contributors](https://github.com/learning-at-home/hivemind/graphs/contributors), and reviewers
 
-### Related projects
+# Related projects
 
 We also want to reference several projects that have similar ideas in mind:
 
-* [BitTensor](https://github.com/opentensor/BitTensor) - a decentralized deep learning ecosystem with incentive
- mechanism. Like hivemind, but peers are getting rewarded for their contribution to other peers.
-  _(note: as of 26.08.2020 the project is in the early stages development)_.
-* [GShard](https://arxiv.org/abs/2006.16668) - a paper by Dmitry Lepikhin et al. that demonstrate the effectiveness
-  of huge Mixture-of-Experts models on conventional hpc hardware. Those guys train models 4 times the size of GPT-3 on thousands of TPUv3.
+* [BitTensor](https://github.com/opentensor/BitTensor) — a decentralized deep learning ecosystem with an incentive
+  mechanism. Like hivemind, but peers are rewarded for their contribution to other peers.
+* [GShard](https://arxiv.org/abs/2006.16668) — a paper by Dmitry Lepikhin et al. that demonstrates the effectiveness of
+  huge Mixture-of-Experts models on conventional HPC hardware. The authors train models 4 times the size of GPT-3 on
+  thousands of TPUv3 cores.
 * Also doing research in decentralized deep learning? Let us know!

+ 92 - 0
docs/user/benchmarks.md

@@ -0,0 +1,92 @@
+# Benchmarking
+
+This page describes the benchmark scripts that can be used to measure the performance impact of different changes to
+hivemind.
+
+### Server throughput
+
+You can use [this benchmark](https://github.com/learning-at-home/hivemind/blob/master/tests/benchmark_throughput.py) to
+check the performance impact of your changes to hivemind.client and server. The benchmark will start one server
+(without a DHT) hosting several experts, and then spawn trainer processes that load the server with requests. The two
+main statistics in this benchmark are samples/s and startup time.
+
+`python benchmark_throughput.py --preset default` (aka `ffn_forward_backward`)
+
+<details style="margin-top:-24px; margin-bottom: 16px;">
+  <summary>Console outputs</summary>
+
+  ```sh
+Benchmark finished, status:Success
+Server parameters: num_experts=16, num_handlers=64, max_batch_size=8192, expert_cls=ffn, hid_dim=1024, device=cuda
+Client parameters: num_clients=128, num_batches_per_client=16, batch_size=2048, backprop=True
+Results: 
+	Server startup took 10.965 s. (3.075 s. experts + 7.889 s. networking)
+	Processed 4194304 examples in 146.750
+	Throughput for forward + backward passes: 28581.213 samples / s.
+	Benchmarking took 157.948 s.
+Using device: cuda
+GeForce GTX 1080 Ti
+Memory Usage:
+Allocated: 6.0 GB
+Cached:    7.7 GB
+
+  ```
+
+</details>
+
+`python benchmark_throughput.py --preset ffn_forward`
+
+<details style="margin-top:-24px; margin-bottom: 16px;">
+  <summary>Console outputs</summary>
+
+  ```sh
+Benchmark finished, status:Success
+Server parameters: num_experts=16, num_handlers=64, max_batch_size=8192, expert_cls=ffn, hid_dim=1024, device=cuda
+Client parameters: num_clients=128, num_batches_per_client=16, batch_size=2048, backprop=False
+Results: 
+	Server startup took 19.941 s. (3.065 s. experts + 16.877 s. networking)
+	Processed 4194304 examples in 42.973
+	Throughput for forward passes: 97604.282 samples / s.
+	Benchmarking took 63.167 s.
+Using device: cuda
+GeForce GTX 1080 Ti
+Memory Usage:
+Allocated: 1.5 GB
+Cached:    3.2 GB
+```
+
+</details>
+
+### DHT performance
+
+In turn, [this benchmark](https://github.com/learning-at-home/hivemind/blob/master/tests/benchmark_dht.py) can be used
+to measure the performance impact of changes to hivemind.dht. It spawns a DHT with `num_peers` participants, then
+chooses one peer that will declare `num_experts` total experts in batches of `expert_batch_size`. Then, another peer
+will consecutively get all the declared experts and check that they are found.
+
+Here's a run with 1024 participants on the same machine that was used for benchmark_throughput:
+
+`python benchmark_dht.py --num_peers 1024 --num_experts 16384 --expert_batch_size 64 --expiration 99999 --increase_file_limit`
+<details style="margin-top:-24px; margin-bottom: 16px;">
+  <summary>Console outputs</summary>
+
+  ```sh
+Increasing file limit - soft 1024=>32768, hard 1048576=>32768
+Creating peers...
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [01:45<00:00,  9.74it/s]
+Sampled 16384 unique ids (after deduplication)
+Storing peers to dht in batches of 64...
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [12:07<00:00,  2.84s/it]
+Store success rate: 100.0% (48920 / 48920)
+Mean store time: 0.01487, Total: 727.46
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [01:48<00:00,  2.35it/s]
+Get success rate: 100.0 (16384 / 16384)
+Mean get time: 0.00664, Total: 108.73952
+Node survival rate: 100.000%
+  ```
+
+</details>
+
+The three main statistics in this benchmark are total store time, total get time and get success rate. Please also note
+that this benchmark does not emulate node failures or network latency and does not benefit from caching. If one wants
+to account for these factors, one must introduce them manually by changing the code.

+ 16 - 120
docs/user/contributing.md

@@ -1,132 +1,28 @@
-## Developer zone
+# Contributing to hivemind
 
-#### Collaborating best practices:
-Hivemind is still in the early stage of development, we expect only a handful of collaborators with individual roles.
+This section describes the ways to contribute to the hivemind library. For the technical details of developing this
+library and getting your code merged into the master branch, read
+the [guidelines](https://github.com/learning-at-home/hivemind/blob/master/CONTRIBUTING.md) in our GitHub repository. In
+any case, please follow the [Contributor Covenant](https://www.contributor-covenant.org/version/2/0/code_of_conduct/)
+code of conduct when discussing the library and the changes with other community members.
 
-1. Before you write any code, please contact us to avoid duplicate work:
-   * Report bugs and propose new features via issues. We don't have strict templates at this point;
-   * If you decide to implement a feature or fix a bug, first leave a comment in the appropriate issue or create a
-    new one;
-   * Please follow [Contributor Convent v2.0](https://www.contributor-covenant.org/version/2/0/code_of_conduct/).
-2. When you code, follow the best practices:
-   * The code must follow [PEP8](https://www.python.org/dev/peps/pep-0008/) unless absolutely necessary.
-     We recommend pycharm IDE;
-   * All user-facing interfaces must be documented with docstrings and/or sphinx;
-   * We highly encourage the use of [typing](https://docs.python.org/3/library/typing.html), where applicable;
-3. After you write the code, make sure others can use it:
-   * Any function exposed to a user must have a docstring compatible with [readthedocs](https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html);
-   * For new features, please write test(s) to make sure your functionality won't be broken by subsequent changes;
-   * If you face any challenges or want feedback, please submit a [draft](https://github.blog/2019-02-14-introducing-draft-pull-requests/) pull request.
+## Ways to contribute
 
+### Reporting issues
 
-#### Developer quickstart
+### Proposing new features
 
-First, install hivemind in the development mode, preferably with python 3.8 on linux/mac_OS.
-```
-git clone https://github.com/learning-at-home/hivemind
-cd hivemind
-pip install -e .
-``` 
+### Implementing new features
 
-To run tests, you will also need to `pip install -e .[dev]`.
-You can run all tests with `pytest ./tests` or choose a specific set, e.g. `pytest ./tests/test_dht.py`.
+### Fixing bugs and improving performance
 
+### Improving tests
 
-To build docs locally,
-1. `pip install -e .[docs]`
-2. make sure you ran setup.py (see above)
-3. `cd ./docs && make html`
+### Improving code readability
 
-The documentation root will be available in `./docs/_build/html/index.html`
+### Adding tutorials
 
+### Improving documentation
 
-#### Benchmark throughput
-You can use [this benchmark](https://github.com/learning-at-home/hivemind/blob/master/tests/benchmark_throughput.py) to check the performance impact of your changes to hivemind.client and server.
-The benchmark will start one server without dht with several experts, and then spawn trainer processes that bombard the server with requests.
-The two main statistics in this benchmark samples/s and startup time. 
+### Reviewing pull requests
 
-`python benchmark_throughput.py --preset default` (aka `ffn_forward_backward`)
-
-<details style="margin-top:-24px; margin-bottom: 16px;">
-  <summary>Console outputs</summary>
-  
-  ```sh
-Benchmark finished, status:Success
-Server parameters: num_experts=16, num_handlers=64, max_batch_size=8192, expert_cls=ffn, hid_dim=1024, device=cuda
-Client parameters: num_clients=128, num_batches_per_client=16, batch_size=2048, backprop=True
-Results: 
-	Server startup took 10.965 s. (3.075 s. experts + 7.889 s. networking)
-	Processed 4194304 examples in 146.750
-	Throughput for forward + backward passes: 28581.213 samples / s.
-	Benchmarking took 157.948 s.
-Using device: cuda
-GeForce GTX 1080 Ti
-Memory Usage:
-Allocated: 6.0 GB
-Cached:    7.7 GB
-
-  ```
-</details>
-
-`python benchmark_throughput.py --preset ffn_forward`
-
-<details style="margin-top:-24px; margin-bottom: 16px;">
-  <summary>Console outputs</summary>
-  
-  ```sh
-Benchmark finished, status:Success
-Server parameters: num_experts=16, num_handlers=64, max_batch_size=8192, expert_cls=ffn, hid_dim=1024, device=cuda
-Client parameters: num_clients=128, num_batches_per_client=16, batch_size=2048, backprop=False
-Results: 
-	Server startup took 19.941 s. (3.065 s. experts + 16.877 s. networking)
-	Processed 4194304 examples in 42.973
-	Throughput for forward passes: 97604.282 samples / s.
-	Benchmarking took 63.167 s.
-Using device: cuda
-GeForce GTX 1080 Ti
-Memory Usage:
-Allocated: 1.5 GB
-Cached:    3.2 GB
-```
-
-All tests were performed on a single machine with ubuntu server 18.04 x64, msi 1080ti turbo, xeon gold 6149, 
- 384Gb LRDIMM (6x64G), python3.8, torch1.6.0 (pip-installed), grpcio 1.31.0 , 
- the results have around +-5% fluctuation between consecutive runs. 
-
-#### Benchmark DHT
-In turn, [this benchmark](https://github.com/learning-at-home/hivemind/blob/master/tests/benchmark_dht.py) can be used
-to measure performance impact of changes to hivemind.dht. It spawns a DHT with `num_peers` participants, 
-then chooses one peer that will declare `num_experts` total experts in batches of `expert_batch_size`.
-Then, another peer will consecutively get all peers and check if they are there.
-
-Here's a run with 1024 participants on the same machine that was used for benchmark_throughput:
-
-`python benchmark_dht.py --num_peers 1024 --num_experts 16384 --expert_batch_size 64 --expiration 99999 --increase_file_limit`
-<details style="margin-top:-24px; margin-bottom: 16px;">
-  <summary>Console outputs</summary>
-  
-  ```sh
-Increasing file limit - soft 1024=>32768, hard 1048576=>32768
-Creating peers...
-100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [01:45<00:00,  9.74it/s]
-Sampled 16384 unique ids (after deduplication)
-Storing peers to dht in batches of 64...
-100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [12:07<00:00,  2.84s/it]
-Store success rate: 100.0% (48920 / 48920)
-Mean store time: 0.01487, Total: 727.46
-100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [01:48<00:00,  2.35it/s]
-Get success rate: 100.0 (16384 / 16384)
-Mean get time: 0.00664, Total: 108.73952
-Node survival rate: 100.000%
-  ```
-</details>
-
-The three main statistics in this benchmark are total store time, total get time and get success rate.
-Please also note that this benchmark does not emulate node failure, latency and does not benefit from caching.
-If one wants to account for these factors, one must introduce them manually by changing the code.
-  
-
-#### Tips & tricks
-* You can find a wealth of pytorch debugging tricks at [their contributing page](https://tinyurl.com/pytorch-contributing).
-* Hivemind is optimized for development in pycharm CE 2019.3 or newer.
-  * When working on tests, please mark "tests" as sources root.

+ 49 - 34
docs/user/quickstart.md

@@ -1,13 +1,13 @@
-# Quickstart
+# Quick Start
 
 This tutorial will teach you how to install `hivemind`, host your own experts and train them remotely.
 
+## Installation
 
-#### Installation
+Just `pip install hivemind` to get the latest release.
 
-Just `pip install hivemind` to get the latest release. 
+You can also install the bleeding edge version from GitHub:
 
-You can also install the bleeding edge version from github:
 ```
 git clone https://github.com/learning-at-home/hivemind
 cd hivemind
@@ -16,26 +16,29 @@ pip install .
 
 You can also install it in the editable mode with `pip install -e .`.
 
-* __Dependencies:__ Hivemind requires python 3.7+ (3.8 is recommended), it will install [requirements](https://github.com/learning-at-home/hivemind/blob/master/requirements.txt) automatically; 
-* __OS support:__ Linux and macOS should [just work](https://github.com/learning-at-home/hivemind/issues).
-We do not officially support Windows, but you are welcome to contribute your windows build :)
+* __Dependencies:__ Hivemind requires Python 3.8+.
+  The [requirements](https://github.com/learning-at-home/hivemind/blob/master/requirements.txt) are installed
+  automatically.
+* __OS support:__ Linux and macOS should just work. We do not officially support Windows, but you are welcome to
+  contribute your Windows build :)
 
+## Host a server
 
-#### Host a server
+`hivemind.Server` hosts one or several experts (PyTorch modules) for remote access. These experts are responsible for
+most of the model parameters and computation. The server can be started using either Python or
+[a shell script](https://github.com/learning-at-home/hivemind/blob/master/scripts/run_server.py). We'll use the shell
+for now. To host a server with default experts, run this in your shell:
 
-Hivemind.Server hosts one or several experts (torch modules) for remote access. These experts are responsible for 
-most of the model parameters and computation. The server can be started using either python or 
-[a shell script](https://github.com/learning-at-home/hivemind/blob/master/scripts/run_server.py). We'll use the shell for now. 
-To host a server with default experts, run this in your shell:
 ```sh
 python scripts/run_server.py --expert_cls ffn --hidden_dim 512 --num_experts 5 --expert_pattern expert.[0:5] \
                              --listen_on 0.0.0.0:1337 --dht_port 1338
 # note: if you omit listen_on and/or dht_port, they will be chosen automatically and printed to stdout.
 ```
+
 <details style="margin-top:-24px; margin-bottom: 16px;">
   <summary><i>Console outputs</i></summary>
-  
-  ```sh
+
+```sh
 [2020/08/26 11:54:52.645][INFO][server.create:101] Bootstrapping DHT node, initial peers = []
 [2020/08/26 11:54:52.660][INFO][server.create:105] Running dht node on port 1338
 [2020/08/26 11:54:53.182][INFO][server.task_pool.run:130] expert.0_forward starting, pid=19382
@@ -62,23 +65,28 @@ python scripts/run_server.py --expert_cls ffn --hidden_dim 512 --num_experts 5 -
 [2020/08/26 11:54:53.244][INFO][server.runtime.run:60] Started
 [2020/08/26 11:54:53.245][INFO][server.create:136] Server started at 0.0.0.0:1337
 [2020/08/26 11:54:53.245][INFO][server.create:137] Got 5 active experts of type ffn: ['expert.0', 'expert.1', 'expert.2', 'expert.3', 'expert.4']
-  ```
+```
+
 </details>
 
 
-This server accepts requests to experts on port 1337 and start a DHT peer on port 1338.
-In total, it serves 5 feedforward experts with ReLU and LayerNorm
- (see architecture [here](https://github.com/learning-at-home/hivemind/blob/master/hivemind/server/layers/__init__.py#L7-L21)).
+This server accepts requests to experts on port 1337 and starts a DHT peer on port 1338. In total, it serves 5
+feedforward experts with ReLU and LayerNorm (see the
+architecture [here](https://github.com/learning-at-home/hivemind/blob/master/hivemind/server/layers/__init__.py#L7-L21)).
 
 You can create additional servers in the same decentralized network using `--initial_peers` argument:
+
 ```sh
 python scripts/run_server.py --expert_cls ffn --hidden_dim 512 --num_experts 10 --expert_pattern "expert.[5:250]" \
                               --initial_peers localhost:1338
 ```
+
 <details style="margin-top:-24px; margin-bottom: 16px;">
   <summary>Console outputs</summary>
-  
-  ```sh
+
+```sh
 [2020/08/26 13:15:05.078][INFO][server.create:103] Bootstrapping DHT node, initial peers = ['localhost:1338']
 [2020/08/26 13:15:05.101][INFO][server.create:107] Running dht node on port 44291
 expert.[5:250]
@@ -127,15 +135,17 @@ expert.[5:250]
 [2020/08/26 13:15:06.510][INFO][server.create:166] Server started at 0.0.0.0:40089
 [2020/08/26 13:15:06.510][INFO][server.create:167] Got 10 active experts of type ffn: ['expert.113', 'expert.149', 'expert.185', 'expert.189', 'expert.191', 'expert.196', 'expert.225', 'expert.227', 'expert.36', 'expert.58']
 ```
+
 </details>
 
-Here and below, if you are running on a different machine, replace `localhost:1338` with your original server's
-public IP address (e.g. `12.34.56.78:1338`). Hivemind supports both ipv4 and ipv6 protocols and uses the same notation
+Here and below, if you are running on a different machine, replace `localhost:1338` with your original server's public
+IP address (e.g. `12.34.56.78:1338`). Hivemind supports both IPv4 and IPv6 protocols and uses the same notation
 as [gRPC](https://grpc.io/docs/languages/python/basics/#starting-the-server).
 
-#### Run the experts
+## Train the experts
+
+Now let's put these experts to work. Create a Python console (or a Jupyter notebook) and run:
 
-Now let's put these experts to work. Create a python console (or a jupyter) and run: 
 ```python
 import torch
 import hivemind
@@ -149,18 +159,20 @@ assert expert1 is not None and expert4 is not None, "server hasn't declared expe
 ```
 
 The experts (e.g. `expert1`) can be used as a pytorch module with autograd support:
+
 ```python
 dummy = torch.randn(3, 512)
 out = expert1(dummy)  # forward pass
 out.sum().backward()  # backward pass
 ```
 
-When called, expert1 will submit a request to the corresponding server (which you created above) and return
- the output tensor(s) or raise an exception. During backward, pytorch will submit the backward requests
- for the experts as they appear in the computation graph.
- 
-By default, the experts will automatically update their parameters with one step of SGD after each backward pass.
-This allows you to quickly run training using both local and remote layers:
+When called, `expert1` will submit a request to the corresponding server (which you created above) and return the output
+tensor(s) or raise an exception. During the backward pass, PyTorch will submit the backward requests for the experts as they
+appear in the computation graph.
+
+By default, the experts will automatically update their parameters with one step of SGD after each backward pass. This
+allows you to quickly run training using both local and remote layers:
+
 ```python
 # generate dummy data
 x = torch.randn(3, 512)
@@ -181,9 +193,12 @@ for i in range(100):
     opt.step()
 ```
 
-Finally, you can create a Mixture-of-Experts layer over our humble band of experts:
+Finally, you can create a Mixture-of-Experts layer over these experts:
+
 ```python
-import nest_asyncio;  nest_asyncio.apply()  # asyncio patch for jupyter. for now, we recommend using MoE from console
+import nest_asyncio
+
+nest_asyncio.apply()  # asyncio patch for jupyter. for now, we recommend using MoE from console
 dmoe = hivemind.RemoteMixtureOfExperts(in_features=512, uid_prefix="expert", grid_size=(5,),
                                        dht=dht, k_best=2)
 
@@ -192,8 +207,8 @@ out.sum().backward()
 ```
 
 The `dmoe` layer dynamically selects the right experts using a linear gating function. It will then dispatch parallel
-forward (and backward) requests to those experts and collect results.
-You can find more details on how MoE works in Section 2.3 of the [paper](https://arxiv.org/abs/2002.04013)
+forward (and backward) requests to those experts and collect results. You can find more details on how DMoE works in
+Section 2.3 of the [paper](https://arxiv.org/abs/2002.04013).
 
 Congratulations, you've made it through the basic tutorial. Give yourself a pat on the back :)
 

+ 1 - 1
hivemind/__init__.py

@@ -3,4 +3,4 @@ from hivemind.dht import *
 from hivemind.server import *
 from hivemind.utils import *
 
-__version__ = '0.8.29'
+__version__ = '0.9.0'

+ 2 - 1
setup.py

@@ -68,7 +68,7 @@ setup(
     long_description='Decentralized deep learning in PyTorch. Built to train giant models on '
                      'thousands of volunteers across the world.',
     author='Learning@home & contributors',
-    author_email='mryabinin@hse.ru',
+    author_email='mryabinin0@gmail.com',
     url="https://github.com/learning-at-home/hivemind",
     packages=find_packages(exclude=['tests']),
     package_data={'hivemind': ['proto/*']},
@@ -84,6 +84,7 @@ setup(
         'License :: OSI Approved :: MIT License',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
         'Topic :: Scientific/Engineering',
         'Topic :: Scientific/Engineering :: Mathematics',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',

+ 4 - 4
tests/test_averaging.py

@@ -185,7 +185,7 @@ def test_partitioning():
         total_size = sum(map(torch.Tensor.numel, tensors))
         if total_size == 0:
             continue
-        num_chunks = random.randint(1, min(1000, sum(x.numel() for x in tensors)))
+        num_chunks = random.randint(1, min(100, sum(x.numel() for x in tensors)))
         part_sizes = load_balance_peers(total_size, [None] * num_chunks)
         chunks = split_into_parts(tensors, part_sizes)
         assert len(chunks) == num_chunks
@@ -309,9 +309,9 @@ def test_load_state_from_peers():
     assert all(map(torch.allclose, got_tensors, super_tensors))
 
     # check that normal averaging still works
-    futures = [averager.step(wait=False) for averager in [averager1, averager2]]
-    for future in futures:
-        future.result()
+    # futures = [averager.step(wait=False) for averager in [averager1, averager2]]
+    # for future in futures:
+    #     future.result()
 
 
 @pytest.mark.forked

+ 1 - 1
tests/test_dht.py

@@ -53,7 +53,7 @@ def test_dht_get_address(addr=LOCALHOST, dummy_endpoint='123.45.67.89:*'):
 
 
 @pytest.mark.forked
-def test_beam_search(dht_size=20, total_experts=128, batch_size=32, initial_peers=3, beam_size=4, parallel_rpc=256,
+def test_beam_search(dht_size=20, total_experts=128, batch_size=32, initial_peers=3, beam_size=4, parallel_rpc=16,
                      grid_dims=(32, 32, 32)):
     dht = []
     for i in range(dht_size):

+ 2 - 3
tests/test_dht_node.py

@@ -108,7 +108,6 @@ def test_dht_protocol():
 
         if listen:
             loop.run_until_complete(protocol.shutdown())
-        print("DHTProtocol test finished successfully!")
 
     peer1_proc.terminate()
     peer2_proc.terminate()
@@ -193,9 +192,9 @@ def test_dht_node():
     jaccard_numerator = jaccard_denominator = 0  # jaccard similarity aka intersection over union
     all_node_ids = list(dht.values())
 
-    for i in range(100):
+    for i in range(10):
         query_id = DHTID.generate()
-        k_nearest = random.randint(1, 20)
+        k_nearest = random.randint(1, 10)
         exclude_self = random.random() > 0.5
         nearest = loop.run_until_complete(
             me.find_nearest_nodes([query_id], k_nearest=k_nearest, exclude_self=exclude_self))[query_id]

+ 16 - 16
tests/test_moe.py

@@ -26,7 +26,7 @@ def test_moe():
 
 
 @pytest.mark.forked
-def test_call_many():
+def test_call_many(hidden_dim=16):
     k_min = 1
     timeout_after_k_min = None
     backward_k_min = 1
@@ -35,9 +35,9 @@ def test_call_many():
     detect_anomalies = False
     atol = 1e-5
 
-    with background_server(num_experts=5, device='cpu', expert_cls='ffn', num_handlers=8, hidden_dim=64,
+    with background_server(num_experts=5, device='cpu', expert_cls='ffn', num_handlers=8, hidden_dim=hidden_dim,
                            optim_cls=None, no_dht=True) as (server_endpoint, dht_endpoint):
-        inputs = torch.randn(4, 64, requires_grad=True)
+        inputs = torch.randn(4, hidden_dim, requires_grad=True)
         inputs_clone = inputs.clone().detach().requires_grad_(True)
         e0, e1, e2, e3, e4 = [hivemind.RemoteExpert(f'expert.{i}', server_endpoint) for i in range(5)]
         e5 = hivemind.RemoteExpert(f'thisshouldnotexist', '127.0.0.1:80')
@@ -47,7 +47,7 @@ def test_call_many():
             forward_timeout, backward_timeout, detect_anomalies, e1.info, inputs
         )
         assert mask.shape == (4, 3)
-        assert expert_outputs.shape == (4, 3, 64)
+        assert expert_outputs.shape == (4, 3, hidden_dim)
 
         assert np.all(mask.data.numpy() == np.array([[True, True, True],
                                                      [True, True, False],
@@ -64,7 +64,7 @@ def test_call_many():
         reference_outputs[2, 2] = e3(inputs_clone[2:3])
 
         assert torch.allclose(expert_outputs, reference_outputs, atol=atol, rtol=0)
-        proj = torch.randn(4, 64)
+        proj = torch.randn(4, hidden_dim)
         loss = (expert_outputs[(0, 1, 1, 2), (0, 2, 1, 0)] * proj).sum()
         loss.backward()
         our_grad = inputs.grad.data.cpu().clone()
@@ -76,17 +76,17 @@ def test_call_many():
 
 
 @pytest.mark.forked
-def test_remote_module_call():
-    with background_server(num_experts=1, device='cpu', expert_cls='ffn', num_handlers=1, hidden_dim=1024,
+def test_remote_module_call(hidden_dim=16):
+    with background_server(num_experts=1, device='cpu', expert_cls='ffn', num_handlers=1, hidden_dim=hidden_dim,
                            optim_cls=None, no_dht=True) as (server_endpoint, dht_endpoint):
         real_expert = hivemind.RemoteExpert('expert.0', server_endpoint)
         fake_expert = hivemind.RemoteExpert('oiasfjiasjf', server_endpoint)
 
-        out1 = real_expert(torch.randn(1, 1024))
-        assert out1.shape == (1, 1024)
-        dummy_x = torch.randn(3, 1024, requires_grad=True)
+        out1 = real_expert(torch.randn(1, hidden_dim))
+        assert out1.shape == (1, hidden_dim)
+        dummy_x = torch.randn(3, hidden_dim, requires_grad=True)
         out3 = real_expert(dummy_x)
-        assert out3.shape == (3, 1024)
+        assert out3.shape == (3, hidden_dim)
         out3_again = real_expert(dummy_x[1:])
         assert torch.allclose(out3_again, out3[1:], atol=1e-5, rtol=0)
         out3_again.norm().backward()
@@ -126,13 +126,13 @@ def test_beam_search_correctness():
 
 
 @pytest.mark.forked
-def test_determinism():
+def test_determinism(hidden_dim=16):
     atol = 1e-5
 
-    xx = torch.randn(32, 1024, requires_grad=True)
-    mask = torch.randint(0, 1, (32, 1024))
+    xx = torch.randn(32, hidden_dim, requires_grad=True)
+    mask = torch.randint(0, 1, (32, hidden_dim))
 
-    with background_server(num_experts=1, device='cpu', expert_cls='det_dropout', num_handlers=1,
+    with background_server(num_experts=1, device='cpu', expert_cls='det_dropout', num_handlers=1, hidden_dim=hidden_dim,
                            optim_cls=None, no_dht=True) as (server_endpoint, dht_endpoint):
         expert = hivemind.RemoteExpert(uid=f'expert.0', endpoint=server_endpoint)
 
@@ -151,7 +151,7 @@ def test_compute_expert_scores():
     try:
         dht = hivemind.DHT(start=True)
         moe = hivemind.client.moe.RemoteMixtureOfExperts(
-            dht=dht, in_features=1024, grid_size=(40,), k_best=4, k_min=1, timeout_after_k_min=1,
+            dht=dht, in_features=16, grid_size=(40,), k_best=4, k_min=1, timeout_after_k_min=1,
             uid_prefix='expert.')
         gx, gy = torch.randn(4, 5, requires_grad=True), torch.randn(4, 3, requires_grad=True)
         ii = [[4, 0, 2], [3, 1, 1, 1, 3], [0], [3, 2]]