Browse Source

[Suggestions are welcome] Docs v0.8 (#86)

* extended quickstart guide (with TODO)

* add icon

* Use the same style for all titles

* Add subtitles

* Move contributing.md to docs

* update readme, add dummy acknowledgements.md

* update readme, add dummy acknowledgements.md

* added credits

* added credits

* deprecate expert_offset, add expert_pattern

* declare experts right away

* allow directly specifying expert ids

* fix docs

* add TBAs

* benchmark: switch from jpg to console outputs

* rm TBAs

* address mryab review

* unused import

* TBA

* separate function to get uids
justheuristic 5 years ago
parent
commit
4edec82bb9

+ 0 - 24
CONTRIBUTING.md

@@ -1,24 +0,0 @@
-
-
-### Rules for collaborating:
-Hivemind is still in the early stage of development, we expect only a handful of collaborators with individual roles.
-
-1. Before you write any code, please contact us to avoid duplicate work:
-   * Report bugs and propose new features via issues. We don't have templates at this point;
-   * If you decide to implement a feature or fix a bug, leave a comment in the appropriate issue or create a new one;
-   * Please follow [Contributor Convent v2.0](https://www.contributor-covenant.org/version/2/0/code_of_conduct/).
-2. When you code, follow the best practices:
-   * We use [GitFlow](https://datasift.github.io/gitflow/IntroducingGitFlow.html)-style development;
-   * The code itself must follow [PEP8](https://www.python.org/dev/peps/pep-0008/). We recommend using pycharm builtin linter;
-   * We highly encourage the use of typing, where applicable; If not applicable, use other tools like docstrings;
-3. After you write the code, make sure others can use it:
-   * Any function exposed to a user must have a docstring compatible with [sphinx](https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html);
-   * For new features, please write test(s) to make sure your functionality won't be broken by subsequent changes;
-   * If you face any challenges or want feedback, please submit pull request early with a [WIP] tag = work in progress.
-
-
-
-### Tips & tricks
-* You can find a wealth of pytorch debugging tricks at [their contributing page](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md).
-* Hivemind is optimized for development in pycharm CE 2019.3 or newer.
-  * When working on tests, please mark "tests" as sources root.

+ 9 - 5
README.md

@@ -1,12 +1,16 @@
 ## Hivemind
-
 [![Build status](https://circleci.com/gh/learning-at-home/hivemind.svg?style=shield)](https://circleci.com/gh/learning-at-home/hivemind)
 [![Documentation Status](https://readthedocs.org/projects/learning-at-home/badge/?version=latest)](https://learning-at-home.readthedocs.io/en/latest/?badge=latest)
 
-Distributed training of large neural networks across volunteer computers.
+A library to train large neural networks across the internet. Imagine training one huge transformer
+  on thousands of computers from universities, companies, and volunteers.
 
 ![img](https://i.imgur.com/GPxolxb.gif)
 
-**[WIP]** - this branch is a work in progress. If you're interested in
-supplementary code for [Learning@home paper](https://arxiv.org/abs/2002.04013),
-you can find it at https://github.com/mryab/learning-at-home.
+##### Links:
+ * What is hivemind all about? Here's a [3-minute read](https://learning-at-home.github.io) or a [full paper](https://arxiv.org/abs/2002.04013)
+ * [Quickstart tutorial](https://learning-at-home.readthedocs.io/en/latest/user/quickstart.html) - install hivemind, 
+    set up a server and train experts  
+ * Documentation & guides: [learning-at-home.readthedocs.io](https://learning-at-home.readthedocs.io)
+ * [Contributor's guide](https://learning-at-home.readthedocs.io/en/latest/user/contributing.html): best practices, tests and performance benchmarks
+ * [Related](https://learning-at-home.readthedocs.io/en/latest/user/acknowledgements.html) projects and acknowledgements

BIN
docs/_static/favicon.png


+ 5 - 1
docs/conf.py

@@ -97,7 +97,11 @@ html_theme = 'sphinx_rtd_theme'
 # further.  For a list of options available for each theme, see the
 # documentation.
 #
-# html_theme_options = {}
+html_theme_options = {
+    "collapse_navigation": False
+}
+
+html_favicon = '_static/favicon.png'
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,

+ 17 - 16
docs/index.rst

@@ -1,30 +1,31 @@
-``learning@home::hivemind``
+|logo| **Hivemind docs & tutorials**
 ====================================
 
-Hivemind lets you train huge neural networks on computers provided by volunteers. Powered by pytorch.
+ .. |logo| image:: _static/favicon.png
+  :scale: 48
 
-.. image:: _static/bug.gif
+This is a library for decentralized deep learning. It allows you to train large neural networks using vast numbers
+of unreliable computers.
+Learn how to create or join a hivemind run in the `quickstart tutorial <./user/quickstart.html>`__ or browse the API
+documentation below.
 
-User guide:
+| Hivemind is currently in active development, so expect some adventures. If you encounter any issues, please let us know
+  `on github <https://github.com/learning-at-home/hivemind/issues>`__.
 
-.. toctree::
-  :maxdepth: 2
-
-  user/quickstart.md
-
-
-API documentation:
 
+**Table of contents:**
+~~~~~~~~~~~~~~~~~~~~~~
 .. toctree::
   :maxdepth: 2
+  :glob:
 
-  modules/client.rst
-  modules/server.rst
-  modules/dht.rst
+  user/quickstart
+  modules/index
+  user/contributing
+  user/acknowledgements
 
 Indices and tables
-==================
-
+~~~~~~~~~~~~~~~~~~
 * :ref:`genindex`
 * :ref:`modindex`
 * :ref:`search`

+ 1 - 1
docs/modules/client.rst

@@ -1,4 +1,4 @@
-``hidemind.client``
+**hivemind.client**
 ====================
 
 .. automodule:: hivemind.client

+ 1 - 1
docs/modules/dht.rst

@@ -1,4 +1,4 @@
-**Hivemind DHT**
+**hivemind.dht**
 ====================
 
 .. automodule:: hivemind.dht

+ 10 - 0
docs/modules/index.rst

@@ -0,0 +1,10 @@
+####################
+  API documentation
+####################
+
+.. toctree::
+   :maxdepth: 2
+
+   client
+   server
+   dht

+ 20 - 1
docs/modules/server.rst

@@ -1,22 +1,41 @@
-**Hivemind Server**
+**hivemind.server**
 ========================================
 
+A hivemind server hosts one or several experts and processes incoming requests to those experts. It periodically
+re-publishes these experts to the dht via a dedicated **hivemind.dht.DHT** peer that runs in the background.
+The experts can be accessed directly as **hivemind.client.RemoteExpert("expert.uid.here", "addr:port")**
+or as a part of **hivemind.client.RemoteMixtureOfExperts** that finds the most suitable experts across the DHT.
+
+The hivemind.server module is organized as follows:
+
+- Server_ is the main class that publishes experts, accepts incoming requests, and passes them to Runtime_ for compute.
+- Runtime_ balances the device (GPU) usage between several ExpertBackend_ instances that each service one expert.
+- ExpertBackend_ is a wrapper for `torch.nn.Module <https://pytorch.org/docs/stable/generated/torch.nn.Module.html>`_ \
+  that can be accessed by remote clients. It has two TaskPool_ instances: one for forward and one for backward requests.
+- TaskPool_ stores incoming requests for a batch-parallel computation (e.g. forward pass), groups them into batches \
+  and offers those batches to Runtime_ for processing.
+
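+For example, a served expert can be reached like this (a minimal sketch; it assumes a server is already
+listening on ``localhost:1337`` and hosts an expert with uid ``expert.uid.here`` whose input dimension is 1024):
+
+.. code-block:: python
+
+    import torch
+    import hivemind
+
+    # connect to an expert hosted by an existing server
+    expert = hivemind.RemoteExpert('expert.uid.here', 'localhost:1337')
+    output = expert(torch.randn(3, 1024))  # forward pass runs on the remote server
+    output.sum().backward()                # backward requests go to the same server
+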
+
 .. automodule:: hivemind.server
 
 .. currentmodule:: hivemind.server
 
+.. _Server:
 .. autoclass:: Server
    :members:
    :member-order: bysource
 
+.. _Runtime:
 .. autoclass:: Runtime
     :members:
     :member-order: bysource
 
+.. _ExpertBackend:
 .. autoclass:: ExpertBackend
     :members: forward, backward, apply_gradients, get_info, get_pools
     :member-order: bysource
 
+.. _TaskPool:
 .. autoclass:: TaskPool
     :members: submit_task, iterate_minibatches, load_batch_to_runtime, send_outputs_from_runtime, get_task_size, empty
     :member-order: bysource

+ 26 - 0
docs/user/acknowledgements.md

@@ -0,0 +1,26 @@
+## Credits
+
+We kindly thank (in random order)
+* [Artem Babenko](https://research.yandex.com/people/102794) and 
+  [Vladimir Aliev](https://ru.linkedin.com/in/vladimir-aliev-19b93282) (yandex research) for helpful discussions
+  and editorial review of the paper,
+* [Jacob R. Steeves](https://github.com/unconst) (bittensor) for discussions on RPC frameworks, NAT traversal,
+  and peer-to-peer technologies,
+* [Dmitry Afanasiev](https://www.linkedin.com/in/dmitry-afanasiev-295a231/) (yandex) for his guidance on networking
+  and communication technologies,
+* [Lidi Zheng](https://github.com/lidizheng) (google) and grpc-aio contributors for their awesome framework and [this pr](https://github.com/grpc/grpc/pull/23265),
+* [Brian Muller](https://github.com/bmuller/kademlia) (parallel markets) for his implementations of [kademlia](https://github.com/bmuller/kademlia) and [rpcudp](https://github.com/bmuller/rpcudp),
+* Alexander Sherbakov (itsoft) for helpful discussions on PC and server component architecture,
+* Our early adopters, [contributors](https://github.com/learning-at-home/hivemind/graphs/contributors), and reviewers
+
+
+### Related projects
+
+We also want to reference several projects that have similar ideas in mind:
+
+* [BitTensor](https://github.com/opentensor/BitTensor) - a decentralized deep learning ecosystem with an incentive
+  mechanism. Like hivemind, but peers are rewarded for their contributions to other peers
+  _(note: as of 26.08.2020 the project is in the early stages of development)_.
+* [GShard](https://arxiv.org/abs/2006.16668) - a paper by Dmitry Lepikhin et al. that demonstrates the effectiveness
+  of huge Mixture-of-Experts models on conventional HPC hardware. The authors train models 4 times the size of GPT-3 on thousands of TPUv3 cores.
+* Also doing research in decentralized deep learning? Let us know!

+ 130 - 0
docs/user/contributing.md

@@ -0,0 +1,130 @@
+## Contributing
+
+#### Collaborating best practices:
+Hivemind is still in an early stage of development; we expect only a handful of collaborators with individual roles.
+
+1. Before you write any code, please contact us to avoid duplicate work:
+   * Report bugs and propose new features via issues. We don't have strict templates at this point;
+   * If you decide to implement a feature or fix a bug, first leave a comment in the appropriate issue or create a
+    new one;
+   * Please follow [Contributor Covenant v2.0](https://www.contributor-covenant.org/version/2/0/code_of_conduct/).
+2. When you code, follow the best practices:
+   * The code must follow [PEP8](https://www.python.org/dev/peps/pep-0008/) unless a deviation is absolutely necessary.
+     We recommend the PyCharm IDE;
+   * All user-facing interfaces must be documented with docstrings and/or sphinx;
+   * We highly encourage the use of [typing](https://docs.python.org/3/library/typing.html), where applicable;
+3. After you write the code, make sure others can use it:
+   * Any function exposed to a user must have a docstring compatible with [readthedocs](https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html);
+   * For new features, please write test(s) to make sure your functionality won't be broken by subsequent changes;
+   * If you face any challenges or want feedback, please submit a [draft](https://github.blog/2019-02-14-introducing-draft-pull-requests/) pull request.
+
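+For instance, a sphinx-compatible docstring for a (hypothetical) helper function could look like this:
+```python
+def average_gradients(tensors, weights=None):
+    """
+    Compute an (optionally weighted) average of several gradient tensors.
+
+    :param tensors: a sequence of torch tensors with identical shapes
+    :param weights: optional averaging weights, one per tensor; defaults to uniform weights
+    :returns: a single tensor of the same shape as each input
+    """
+```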
+
+#### Contributor's manual
+
+First, install hivemind in development mode, preferably with Python 3.8 on Linux or macOS.
+```
+git clone https://github.com/learning-at-home/hivemind
+cd hivemind
+python setup.py develop
+``` 
+
+To run tests, you will also need to `pip install pytest codecov tqdm scikit-learn`.
+You can run all tests with `pytest ./tests` or choose a specific set, e.g. `pytest ./tests/test_dht.py`.
+
+To build docs locally,
+1. `pip install sphinx sphinx_rtd_theme recommonmark`
+2. make sure you ran setup.py (see above)
+3. `cd ./docs && make html`
+
+The documentation root will be available in `./docs/_build/html/index.html`
+
+
+#### Benchmark throughput
+You can use [this benchmark](https://github.com/learning-at-home/hivemind/blob/master/tests/benchmark_throughput.py) to check the performance impact of your changes to hivemind.client and server.
+The benchmark starts one server (without DHT) that hosts several experts, then spawns trainer processes that bombard the server with requests.
+The two main statistics in this benchmark are samples/s and startup time.
+
+`python benchmark_throughput.py --preset default` (aka `ffn_forward_backward`)
+
+<details style="margin-top:-24px; margin-bottom: 16px;">
+  <summary>Console outputs</summary>
+  
+  ```sh
+Benchmark finished, status:Success
+Server parameters: num_experts=16, num_handlers=64, max_batch_size=8192, expert_cls=ffn, hid_dim=1024, device=cuda
+Client parameters: num_clients=128, num_batches_per_client=16, batch_size=2048, backprop=True
+Results: 
+	Server startup took 10.965 s. (3.075 s. experts + 7.889 s. networking)
+	Processed 4194304 examples in 146.750
+	Throughput for forward + backward passes: 28581.213 samples / s.
+	Benchmarking took 157.948 s.
+Using device: cuda
+GeForce GTX 1080 Ti
+Memory Usage:
+Allocated: 6.0 GB
+Cached:    7.7 GB
+
+  ```
+</details>
+
+`python benchmark_throughput.py --preset ffn_forward`
+
+<details style="margin-top:-24px; margin-bottom: 16px;">
+  <summary>Console outputs</summary>
+  
+  ```sh
+Benchmark finished, status:Success
+Server parameters: num_experts=16, num_handlers=64, max_batch_size=8192, expert_cls=ffn, hid_dim=1024, device=cuda
+Client parameters: num_clients=128, num_batches_per_client=16, batch_size=2048, backprop=False
+Results: 
+	Server startup took 19.941 s. (3.065 s. experts + 16.877 s. networking)
+	Processed 4194304 examples in 42.973
+	Throughput for forward passes: 97604.282 samples / s.
+	Benchmarking took 63.167 s.
+Using device: cuda
+GeForce GTX 1080 Ti
+Memory Usage:
+Allocated: 1.5 GB
+Cached:    3.2 GB
+```
+
+All tests were performed on a single machine with Ubuntu Server 18.04 x64, an MSI GTX 1080 Ti Turbo, a Xeon Gold 6149,
+ 384 GB LRDIMM (6x64 GB), Python 3.8, torch 1.6.0 (pip-installed), and grpcio 1.31.0.
+ The results fluctuate by around ±5% between consecutive runs.
+
+#### Benchmark DHT
+In turn, [this benchmark](https://github.com/learning-at-home/hivemind/blob/master/tests/benchmark_dht.py) can be used
+to measure the performance impact of changes to hivemind.dht. It spawns a DHT with `num_peers` participants,
+then chooses one peer that will declare `num_experts` total experts in batches of `expert_batch_size`.
+Then, another peer consecutively gets all declared experts and checks that they can be found.
+
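+A typical invocation might look like this (the flag names below are an assumption for illustration; check the
+script's argparse section for the exact interface):
+```sh
+python tests/benchmark_dht.py --num_peers 1024 --num_experts 16384 --expert_batch_size 64
+```
+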
+Here's a run with 1024 participants on the same machine that was used for benchmark_throughput:
+
+<details style="margin-top:-24px; margin-bottom: 16px;">
+  <summary>Console outputs</summary>
+  
+  ```sh
+Increasing file limit - soft 1024=>32768, hard 1048576=>32768
+Creating peers...
+100%|███████████████████████████████████████████████████| 1024/1024 [01:51<00:00,  9.22it/s]
+Sampled 16384 unique ids (after deduplication)
+Storing peers to dht in batches of 64...
+100%|█████████████████████████████████████████████████████| 256/256 [13:00<00:00,  3.05s/it]
+Store success rate: 100.0% (48904 / 48904)
+Mean store time: 0.015967, Total: 780.85
+100%|█████████████████████████████████████████████████████| 256/256 [02:01<00:00,  2.11it/s]
+Get success rate: 100.0 (16383 / 16384)
+Mean get time: 0.00740, Total: 121.29011
+Node survival rate: 100.000%
+  ```
+</details>
+
+The three main statistics in this benchmark are total store time, total get time and get success rate.
+Please also note that this benchmark does not emulate node failure or network latency, and it does not benefit from caching.
+If one wants to account for these factors, one must introduce them manually by changing the code.
+  
+
+#### Tips & tricks
+* You can find a wealth of pytorch debugging tricks at [their contributing page](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md).
+* Hivemind is optimized for development in pycharm CE 2019.3 or newer.
+  * When working on tests, please mark "tests" as sources root.

+ 200 - 57
docs/user/quickstart.md

@@ -1,57 +1,200 @@
-# Quick start [nothing here yet]
-
-This will eventually become a tutorial on how to host a hivemind node or connect to an existing node.
-
-![img](https://media.giphy.com/media/3oz8xtBx06mcZWoNJm/giphy.gif)
-
-## What do I need to run it?
-
-- One or several computers, each equipped with at least one GPU
-- Each computer should have at least two open ports (if not, consider ssh port
-  forwarding)
-- Some popular Linux x64 distribution
-  - Tested on Ubuntu16.04, should work fine on any popular linux64 and even
-    MacOS;
-  - Running on Windows natively is not supported, please use vm or docker;
-
-## How do I run it?
-
-Currently, there is no way to do it easily. There are some tests (you can check [`./tests/benchmark_throughput.py`](https://github.com/learning-at-home/hivemind/blob/master/tests/benchmark_throughput.py)
- or look into CI logs) and we want to expand them. If you want to
-do something complex with it, please contact us by opening an issue (less preferred: [telegram](https://t.me/justheuristic)).
-
-## `hivemind` quick tour
-
-**Trainer process:**
-
-- **`RemoteExpert`**(`hivemind/client/remote_expert.py`) behaves like a pytorch
-  module with autograd support but actually sends request to a remote runtime.
-- **`RemoteMixtureOfExperts`**(`hivemind/client/remote_moe.py`) finds best experts
-  for a given input and either returns them as `RemoteExpert` or applies them
-  right away.
-
-**Runtime process:**
-
-- **`Runtime`** (`hivemind/runtime/__init__.py`) aggregates batches
-  and performs inference/training of experts according to their priority.
-- **`Server`** (`hivemind/server/__init__.py`) wraps runtime and
-  periodically uploads experts into `DHT`.
-
-**DHT:**
-
-- **`DHT`**(`hivemind/dht/__init__.py`) is a node of
-  Kademlia-based DHT that stores metadata used by trainer and runtime.
-
-## Limitations
-
-**DHT**:
-
-- DHT functionality is severely limited by its inability to traverse NAT.
-- Because of this all the features that require DHT are in deep pre-alpha state
-  and cannot be used without special setup.
-
-**Runtime**:
-* You can achieve 4x less network load by passing quantized uint8 activations across experts.
-    Implement your own quantization or wait for hivemind v0.8.
-* Currently runtime can form batches that exceed maximal batch_size by task_size - 1. 
-    We will fix that in the nearest patch.
+# Quickstart
+
+This tutorial will teach you how to install `hivemind`, host your own experts and train them remotely.
+
+
+#### Installation
+
+Just `pip install hivemind` to get the latest release. 
+
+You can also install the bleeding edge version from github:
+```
+git clone https://github.com/learning-at-home/hivemind
+cd hivemind
+python setup.py install
+```
+
+You can also install it in editable mode with `python setup.py develop`.
+
+* __Dependencies:__ Hivemind requires Python 3.7+ (3.8 is recommended) and will install [requirements](https://github.com/learning-at-home/hivemind/blob/master/requirements.txt) automatically;
+* __OS support:__ Linux and macOS should [just work](https://github.com/learning-at-home/hivemind/issues).
+We do not officially support Windows, but you are welcome to try and contribute your Windows build :)
+
+
+#### Host a server
+
+Hivemind.Server hosts one or several experts (torch modules) for remote access. These experts are responsible for 
+most of the model parameters and computation. The server can be started using either python or 
+[a shell script](https://github.com/learning-at-home/hivemind/blob/master/scripts/run_server.py). We'll use the shell for now. 
+To host a server with default experts, run this in your shell:
+```sh
+python scripts/run_server.py --expert_cls ffn --hidden_dim 512 --num_experts 5 --expert_pattern expert.[0:5] \
+                             --listen_on 0.0.0.0:1337 --dht_port 1338
+# note: if you omit listen_on and/or dht_port, they will be chosen automatically and printed to stdout.
+```
+<details style="margin-top:-24px; margin-bottom: 16px;">
+  <summary><i>Console outputs</i></summary>
+  
+  ```sh
+[2020/08/26 11:54:52.645][INFO][server.create:101] Bootstrapping DHT node, initial peers = []
+[2020/08/26 11:54:52.660][INFO][server.create:105] Running dht node on port 1338
+[2020/08/26 11:54:53.182][INFO][server.task_pool.run:130] expert.0_forward starting, pid=19382
+[2020/08/26 11:54:53.182][INFO][server.task_pool.run:130] expert.0_forward starting, pid=19382
+[2020/08/26 11:54:53.189][INFO][server.task_pool.run:130] expert.0_backward starting, pid=19384
+[2020/08/26 11:54:53.189][INFO][server.task_pool.run:130] expert.0_backward starting, pid=19384
+[2020/08/26 11:54:53.196][INFO][server.task_pool.run:130] expert.1_forward starting, pid=19386
+[2020/08/26 11:54:53.196][INFO][server.task_pool.run:130] expert.1_forward starting, pid=19386
+[2020/08/26 11:54:53.206][INFO][server.task_pool.run:130] expert.1_backward starting, pid=19388
+[2020/08/26 11:54:53.206][INFO][server.task_pool.run:130] expert.1_backward starting, pid=19388
+[2020/08/26 11:54:53.212][INFO][server.task_pool.run:130] expert.2_forward starting, pid=19390
+[2020/08/26 11:54:53.212][INFO][server.task_pool.run:130] expert.2_forward starting, pid=19390
+[2020/08/26 11:54:53.218][INFO][server.task_pool.run:130] expert.2_backward starting, pid=19392
+[2020/08/26 11:54:53.218][INFO][server.task_pool.run:130] expert.2_backward starting, pid=19392
+[2020/08/26 11:54:53.225][INFO][server.task_pool.run:130] expert.3_forward starting, pid=19394
+[2020/08/26 11:54:53.225][INFO][server.task_pool.run:130] expert.3_forward starting, pid=19394
+[2020/08/26 11:54:53.232][INFO][server.task_pool.run:130] expert.3_backward starting, pid=19396
+[2020/08/26 11:54:53.232][INFO][server.task_pool.run:130] expert.3_backward starting, pid=19396
+[2020/08/26 11:54:53.235][INFO][server.task_pool.run:130] expert.4_forward starting, pid=19398
+[2020/08/26 11:54:53.235][INFO][server.task_pool.run:130] expert.4_forward starting, pid=19398
+[2020/08/26 11:54:53.241][INFO][server.task_pool.run:130] expert.4_backward starting, pid=19400
+[2020/08/26 11:54:53.241][INFO][server.task_pool.run:130] expert.4_backward starting, pid=19400
+[2020/08/26 11:54:53.244][INFO][server.runtime.run:60] Started
+[2020/08/26 11:54:53.244][INFO][server.runtime.run:60] Started
+[2020/08/26 11:54:53.245][INFO][server.create:136] Server started at 0.0.0.0:1337
+[2020/08/26 11:54:53.245][INFO][server.create:137] Got 5 active experts of type ffn: ['expert.0', 'expert.1', 'expert.2', 'expert.3', 'expert.4']
+  ```
+</details>
+
+
+This server accepts requests to experts on port 1337 and starts a DHT peer on port 1338.
+In total, it serves 5 feedforward experts with ReLU and LayerNorm
+ (see architecture [here](https://github.com/learning-at-home/hivemind/blob/master/hivemind/server/layers/__init__.py#L7-L21)).
+
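+Roughly, each served expert is a module like the following (a simplified sketch for illustration; the exact
+layer sizes live in the linked source):
+```python
+import torch.nn as nn
+
+class FeedforwardBlock(nn.Module):
+    # a feedforward expert: expand the hidden state, apply ReLU, project back, then normalize
+    def __init__(self, hid_dim):
+        super().__init__()
+        self.layers = nn.Sequential(
+            nn.Linear(hid_dim, 4 * hid_dim),
+            nn.ReLU(),
+            nn.Linear(4 * hid_dim, hid_dim),
+            nn.LayerNorm(hid_dim),
+        )
+
+    def forward(self, x):
+        return self.layers(x)
+```
+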
+You can create additional servers in the same decentralized network using the `--initial_peers` argument:
+```sh
+python scripts/run_server.py --expert_cls ffn --hidden_dim 512 --num_experts 10 --expert_pattern "expert.[5:250]" \
+                              --initial_peers localhost:1338
+```
+<details style="margin-top:-24px; margin-bottom: 16px;">
+  <summary>Console outputs</summary>
+  
+  ```sh
+[2020/08/26 13:15:05.078][INFO][server.create:103] Bootstrapping DHT node, initial peers = ['localhost:1338']
+[2020/08/26 13:15:05.101][INFO][server.create:107] Running dht node on port 44291
+expert.[5:250]
+[2020/08/26 13:15:06.326][INFO][server.task_pool.run:130] expert.113_forward starting, pid=29517
+[2020/08/26 13:15:06.326][INFO][server.task_pool.run:130] expert.113_forward starting, pid=29517
+[2020/08/26 13:15:06.333][INFO][server.task_pool.run:130] expert.113_backward starting, pid=29519
+[2020/08/26 13:15:06.333][INFO][server.task_pool.run:130] expert.113_backward starting, pid=29519
+[2020/08/26 13:15:06.340][INFO][server.task_pool.run:130] expert.149_forward starting, pid=29521
+[2020/08/26 13:15:06.340][INFO][server.task_pool.run:130] expert.149_forward starting, pid=29521
+[2020/08/26 13:15:06.352][INFO][server.task_pool.run:130] expert.149_backward starting, pid=29523
+[2020/08/26 13:15:06.352][INFO][server.task_pool.run:130] expert.149_backward starting, pid=29523
+[2020/08/26 13:15:06.363][INFO][server.task_pool.run:130] expert.185_forward starting, pid=29525
+[2020/08/26 13:15:06.363][INFO][server.task_pool.run:130] expert.185_forward starting, pid=29525
+[2020/08/26 13:15:06.375][INFO][server.task_pool.run:130] expert.185_backward starting, pid=29527
+[2020/08/26 13:15:06.375][INFO][server.task_pool.run:130] expert.185_backward starting, pid=29527
+[2020/08/26 13:15:06.381][INFO][server.task_pool.run:130] expert.189_forward starting, pid=29529
+[2020/08/26 13:15:06.381][INFO][server.task_pool.run:130] expert.189_forward starting, pid=29529
+[2020/08/26 13:15:06.388][INFO][server.task_pool.run:130] expert.189_backward starting, pid=29531
+[2020/08/26 13:15:06.388][INFO][server.task_pool.run:130] expert.189_backward starting, pid=29531
+[2020/08/26 13:15:06.400][INFO][server.task_pool.run:130] expert.191_forward starting, pid=29533
+[2020/08/26 13:15:06.400][INFO][server.task_pool.run:130] expert.191_forward starting, pid=29533
+[2020/08/26 13:15:06.407][INFO][server.task_pool.run:130] expert.191_backward starting, pid=29535
+[2020/08/26 13:15:06.407][INFO][server.task_pool.run:130] expert.191_backward starting, pid=29535
+[2020/08/26 13:15:06.415][INFO][server.task_pool.run:130] expert.196_forward starting, pid=29537
+[2020/08/26 13:15:06.415][INFO][server.task_pool.run:130] expert.196_forward starting, pid=29537
+[2020/08/26 13:15:06.426][INFO][server.task_pool.run:130] expert.196_backward starting, pid=29539
+[2020/08/26 13:15:06.426][INFO][server.task_pool.run:130] expert.196_backward starting, pid=29539
+[2020/08/26 13:15:06.435][INFO][server.task_pool.run:130] expert.225_forward starting, pid=29541
+[2020/08/26 13:15:06.435][INFO][server.task_pool.run:130] expert.225_forward starting, pid=29541
+[2020/08/26 13:15:06.445][INFO][server.task_pool.run:130] expert.225_backward starting, pid=29543
+[2020/08/26 13:15:06.445][INFO][server.task_pool.run:130] expert.225_backward starting, pid=29543
+[2020/08/26 13:15:06.454][INFO][server.task_pool.run:130] expert.227_forward starting, pid=29545
+[2020/08/26 13:15:06.454][INFO][server.task_pool.run:130] expert.227_forward starting, pid=29545
+[2020/08/26 13:15:06.467][INFO][server.task_pool.run:130] expert.227_backward starting, pid=29547
+[2020/08/26 13:15:06.467][INFO][server.task_pool.run:130] expert.227_backward starting, pid=29547
+[2020/08/26 13:15:06.475][INFO][server.task_pool.run:130] expert.36_forward starting, pid=29549
+[2020/08/26 13:15:06.475][INFO][server.task_pool.run:130] expert.36_forward starting, pid=29549
+[2020/08/26 13:15:06.482][INFO][server.task_pool.run:130] expert.36_backward starting, pid=29551
+[2020/08/26 13:15:06.482][INFO][server.task_pool.run:130] expert.36_backward starting, pid=29551
+[2020/08/26 13:15:06.497][INFO][server.task_pool.run:130] expert.58_forward starting, pid=29553
+[2020/08/26 13:15:06.497][INFO][server.task_pool.run:130] expert.58_forward starting, pid=29553
+[2020/08/26 13:15:06.507][INFO][server.task_pool.run:130] expert.58_backward starting, pid=29555
+[2020/08/26 13:15:06.507][INFO][server.task_pool.run:130] expert.58_backward starting, pid=29555
+[2020/08/26 13:15:06.509][INFO][server.runtime.run:60] Started
+[2020/08/26 13:15:06.509][INFO][server.runtime.run:60] Started
+[2020/08/26 13:15:06.510][INFO][server.create:166] Server started at 0.0.0.0:40089
+[2020/08/26 13:15:06.510][INFO][server.create:167] Got 10 active experts of type ffn: ['expert.113', 'expert.149', 'expert.185', 'expert.189', 'expert.191', 'expert.196', 'expert.225', 'expert.227', 'expert.36', 'expert.58']
+```
+</details>
+
+Here and below, if you are running on a different machine, replace `localhost:1338` with your original server's
+public IP address (e.g. `12.34.56.78:1338`). Hivemind supports both ipv4 and ipv6 protocols and uses the same notation
+as [gRPC](https://grpc.io/docs/languages/python/basics/#starting-the-server).
+
+#### Run the experts
+
+Now let's put these experts to work. Open a Python console (or a Jupyter notebook) and run:
+```python
+import torch
+import hivemind
+
+dht = hivemind.DHT(initial_peers=["localhost:1338"], listen=False, start=True)
+# note: listen=False means that your peer will operate in "client-only" mode:
+# it can send requests to other peers, but it will not accept incoming requests in return
+
+expert1, expert4 = dht.get_experts(["expert.1", "expert.4"])
+assert expert1 is not None and expert4 is not None, "server hasn't declared experts (yet?)"
+```
+
+The experts (e.g. `expert1`) can be used as a pytorch module with autograd support:
+```python
+dummy = torch.randn(3, 512)
+out = expert1(dummy)  # forward pass
+out.sum().backward()  # backward pass
+```
+
+When called, expert1 will submit a request to the corresponding server (which you created above) and return
+ the output tensor(s) or raise an exception. During backward, pytorch will submit the backward requests
+ for the experts as they appear in the computation graph.
+ 
+By default, the experts will automatically update their parameters with one optimizer step after each backward pass.
+This allows you to quickly run training using both local and remote layers:
+```python
+# generate dummy data
+x = torch.randn(3, 512)
+y = 0.01 * x.sum(dim=-1, keepdim=True)
+
+# local torch module
+proj_out = torch.nn.Sequential(
+    torch.nn.Linear(512, 3)
+)
+opt = torch.optim.SGD(proj_out.parameters(), lr=0.01)
+
+for i in range(100):
+    prediction = proj_out(expert1(expert4(x)))
+    loss = torch.mean(abs(prediction - y))
+    print(loss.item())
+    opt.zero_grad()
+    loss.backward()
+    opt.step()
+```
+
+Finally, you can create a Mixture-of-Experts layer over our humble band of experts:
+```python
+import nest_asyncio;  nest_asyncio.apply()  # asyncio patch for Jupyter; for now, we recommend using MoE from a console
+dmoe = hivemind.RemoteMixtureOfExperts(in_features=512, uid_prefix="expert", grid_size=(5,),
+                                       dht=dht, k_best=2)
+
+out = dmoe(torch.randn(3, 512))
+out.sum().backward()
+```
+
+The `dmoe` layer dynamically selects the right experts using a linear gating function. It will then dispatch parallel
+forward (and backward) requests to those experts and collect results.
+You can find more details on how MoE works in Section 2.3 of the [paper](https://arxiv.org/abs/2002.04013).
+
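+Conceptually, the expert selection boils down to something like this (a simplified sketch of the idea, not
+hivemind's actual implementation, which scores a multi-dimensional grid of experts):
+```python
+import torch
+
+def choose_experts(x, gating_layer, k_best):
+    # score every expert index with a linear projection of the input
+    logits = gating_layer(x)                       # [batch_size, grid_size]
+    scores, indices = logits.topk(k_best, dim=-1)  # pick the k_best experts per sample
+    return scores, indices
+
+gating_layer = torch.nn.Linear(512, 5)  # matches in_features=512, grid_size=(5,) above
+scores, indices = choose_experts(torch.randn(3, 512), gating_layer, k_best=2)
+```
+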
+Congratulations, you've made it through the basic tutorial. Give yourself a pat on the back :)
+
+More advanced tutorials are coming soon :)

+ 92 - 23
hivemind/server/__init__.py

@@ -1,10 +1,13 @@
+from __future__ import annotations
 import multiprocessing as mp
 import multiprocessing.synchronize
 import threading
+import random
 from contextlib import contextmanager
+from functools import partial
 
 import torch
-from typing import Dict, Optional, Tuple
+from typing import Dict, Optional, Tuple, List
 
 import hivemind
 from hivemind.dht import DHT
@@ -62,48 +65,56 @@ class Server(threading.Thread):
             self.run_in_background(await_ready=True)
 
     @staticmethod
-    def create(listen_on='0.0.0.0:*', num_experts=None, expert_uids=None, expert_cls='ffn', hidden_dim=1024,
-               num_handlers=None, expert_prefix='expert', expert_offset=0, max_batch_size=16384, device=None,
-               no_optimizer=False, no_dht=False, initial_peers=(), dht_port=None, verbose=True,
-               start=False, **kwargs):  # removed type specification (-> Server)
+    def create(listen_on='0.0.0.0:*', num_experts: int = None, expert_uids: List[str] = None, expert_pattern: str = None,
+               expert_cls='ffn', hidden_dim=1024, Optimizer=torch.optim.Adam, num_handlers=None, max_batch_size=4096,
+               device=None, no_dht=False, initial_peers=(), dht_port=None, verbose=True,
+               *, start: bool, **kwargs) -> Server:
         """
         Instantiate a server with several identical experts. See argparse comments below for details
         :param listen_on: network interface with address and (optional) port, e.g. "127.0.0.1:1337" or "[::]:80"
         :param num_experts: run this many identical experts
-        :param expert_prefix: all expert uids will be {expert_prefix}.{index}
-        :param expert_offset: expert uid will use indices in range(expert_offset, expert_offset + num_experts)
-        :param expert_uids: spawn experts with these exact uids, overrides num_experts, expert_prefix and expert_offset
+        :param expert_pattern: a string pattern for expert uids, example: myprefix.[0:32].[0:256]\
+         means "sample random expert uids between myprefix.0.0 and myprefix.31.255";
+        :param expert_uids: spawn experts with these exact uids, overrides num_experts and expert_pattern
         :param expert_cls: expert type from test_utils.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop';
         :param hidden_dim: main dimension for expert_cls
         :param num_handlers: server will use this many parallel processes to handle incoming requests
         :param max_batch_size: total num examples in the same batch will not exceed this value
         :param device: all experts will use this device in torch notation; default: cuda if available else cpu
-        :param no_optimizer: if specified, all optimizers use learning rate=0
+        :param Optimizer: uses this optimizer to train all experts
         :param no_dht: if specified, the server will not be attached to a dht
-        :param initial_peers: a list of peers that will introduce this node to the dht,
-        e.g. ('123.11.22.33:1337', '[fe80::abe2:db1c:be7d:5a85]:4567'), default = no peers
+        :param initial_peers: a list of peers that will introduce this node to the dht,\
+         e.g. ('123.11.22.33:1337', '[fe80::abe2:db1c:be7d:5a85]:4567'), default = no peers
         :param dht_port:  DHT node will listen on this port, default = find open port
         You can then use this node as initial peer for subsequent servers.
         :param verbose: whether to print server started / finished / terminated events
         :param start: if True, starts server right away and returns when server is ready for requests
         """
-        assert (expert_uids is None) != (num_experts is None and expert_prefix == 'expert' and expert_offset == 0), \
-            "Please provide either expert uids *or* (num_experts, expert_prefix and expert_offset), not both"
         if verbose and len(kwargs) != 0:
             print("Ignored kwargs:", kwargs)
         assert expert_cls in name_to_block
-        num_handlers = num_handlers if num_handlers is not None else num_experts * 8
-        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
 
         # initialize dht
         dht = None
         if not no_dht:
-            logger.info("Bootstrapping DHT node, initial peers =", initial_peers)
+            logger.info(f"Bootstrapping DHT node, initial peers = {initial_peers}")
             dht = hivemind.DHT(initial_peers=initial_peers, start=True,
                                listen_on=f"{hivemind.LOCALHOST}:{dht_port or hivemind.find_open_port()}")
             if verbose:
                 logger.info(f"Running dht node on port {dht.port}")
 
+        # get expert uids
+        assert (expert_pattern is None and num_experts is None) or (expert_uids is None), \
+            "Please provide either expert_uids *or* num_experts and expert_pattern, but not both"
+        if expert_uids is None:
+            assert num_experts is not None, "Please specify either expert_uids or num_experts [and expert_pattern]"
+            expert_uids = generate_uids_from_pattern(num_experts, expert_pattern, dht=dht)
+
+        num_experts = len(expert_uids)
+        num_handlers = num_handlers if num_handlers is not None else num_experts * 8
+        Optimizer = Optimizer if Optimizer is not None else partial(torch.optim.SGD, lr=0.0)
+        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
+
         sample_input = name_to_input[expert_cls](4, hidden_dim)
         if isinstance(sample_input, tuple):
             args_schema = tuple(hivemind.BatchTensorDescriptor.from_tensor(arg) for arg in sample_input)
@@ -111,18 +122,14 @@ class Server(threading.Thread):
             args_schema = (hivemind.BatchTensorDescriptor.from_tensor(sample_input),)
 
         # initialize experts
-        if expert_uids is None:
-            num_experts = num_experts if num_experts is not None else 1
-            expert_uids = [f'{expert_prefix}{hivemind.DHT.UID_DELIMITER}{i + expert_offset}'
-                           for i in range(num_experts)]
 
         experts = {}
         for expert_uid in expert_uids:
             expert = name_to_block[expert_cls](hidden_dim)
-            opt = torch.optim.SGD(expert.parameters(), 0.0 if no_optimizer else 0.05)
-            experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert, opt=opt,
+            experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert,
                                                          args_schema=args_schema,
                                                          outputs_schema=hivemind.BatchTensorDescriptor(hidden_dim),
+                                                         opt=Optimizer(expert.parameters()),
                                                          max_batch_size=max_batch_size,
                                                          )
         # actually start server
@@ -245,4 +252,66 @@ def _server_runner(pipe, *args, verbose, **kwargs):
         server.shutdown()
         server.join()
         if verbose:
-            logger.info("Server shut down successfully.")
+            logger.info("Server shut down successfully.")
+
+
+def generate_uids_from_pattern(num_experts: int, expert_pattern: Optional[str], dht: Optional[DHT] = None,
+                               attempts_per_expert=10) -> List[str]:
+    """
+    Sample experts from a given pattern, remove duplicates.
+    :param num_experts: sample this many unique expert uids
+    :param expert_pattern: a string pattern for expert uids, example: myprefix.[0:32].[0:256]\
+     means "sample random expert uids between myprefix.0.0 and myprefix.31.255";
+    :param dht: if specified, uses this DHT to check that expert uids are not yet occupied by other peers
+    :param attempts_per_expert: give up if unable to generate a new expert uid after this many attempts per uid
+    :note: this method is not strictly process-safe. If several servers run it concurrently, they have
+     a small chance of sampling duplicate expert uids.
+    """
+    logger.info("Generating expert uids...")
+    remaining_attempts = attempts_per_expert * num_experts
+    found_uids, attempted_uids = list(), set()
+
+    def _generate_uid():
+        if expert_pattern is None:
+            return f"expert{hivemind.DHT.UID_DELIMITER}{attempts_per_expert * num_experts - remaining_attempts}"
+
+        uid = []
+        for block in expert_pattern.split(hivemind.DHT.UID_DELIMITER):
+            try:
+                if '[' not in block and ']' not in block:
+                    uid.append(block)
+                elif block.startswith('[') and block.endswith(']') and ':' in block:
+                    slice_start, slice_end = map(int, block[1:-1].split(':'))
+                    uid.append(str(random.randint(slice_start, slice_end - 1)))
+                else:
+                    raise ValueError("Block must be either fixed or a range [from:to]")
+            except KeyboardInterrupt as e:
+                raise e
+            except Exception as e:
+                raise ValueError(f"Expert pattern {expert_pattern} has invalid block {block}: {e}")
+        return hivemind.DHT.UID_DELIMITER.join(uid)
+
+    while remaining_attempts > 0 and len(found_uids) < num_experts:
+
+        # 1. sample new expert uids at random
+        new_uids = []
+        while len(new_uids) + len(found_uids) < num_experts and remaining_attempts > 0:
+            new_uid = _generate_uid()
+            remaining_attempts -= 1
+            if new_uid not in attempted_uids:
+                attempted_uids.add(new_uid)
+                new_uids.append(new_uid)
+
+        # 2. look into DHT (if given) and remove duplicates
+        if dht:
+            existing_expert_uids = {found_expert.uid for found_expert in dht.get_experts(new_uids)
+                                    if found_expert is not None}
+            new_uids = [new_uid for new_uid in new_uids if new_uid not in existing_expert_uids]
+
+        found_uids += new_uids
+
+    if len(found_uids) != num_experts:
+        logger.warning(f"Found only {len(found_uids)} out of {num_experts} free expert uids after "
+                       f"{attempts_per_expert * num_experts} attempts")
+    return found_uids
+

+ 1 - 0
hivemind/server/dht_handler.py

@@ -16,5 +16,6 @@ class DHTHandlerThread(threading.Thread):
         self.stop = threading.Event()
 
     def run(self) -> None:
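+        # declare experts once immediately so that clients can discover them without waiting a full update_period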
+        self.dht.declare_experts(self.experts.keys(), self.endpoint)
         while not self.stop.wait(self.update_period):
             self.dht.declare_experts(self.experts.keys(), self.endpoint)

+ 2 - 5
scripts/config.yml

@@ -2,14 +2,11 @@ listen_on: 0.0.0.0:* #'localhost' for local connections only, '0.0.0.0' for ipv4
 num_experts: 1 #run this many identical experts
 expert_cls: ffn #expert type from test_utils.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop'.
 hidden_dim: 1024 #main dimension for expert_cls
-#num_handlers:  #'server will use this many processes to handle incoming requests
 expert_prefix: expert #all expert uids will be {expert_prefix}.{index}
 expert_offset: 0 #expert uid will use indices in range(expert_offset, expert_offset + num_experts)
 max_batch_size: 16384 #total num examples in the same batch will not exceed this value
-#device:  #all experts will use this device in torch notation; default: cuda if available else cpu
-no_optimizer: True #if specified, all optimizers use learning rate=0
+optimizer: adam #optimizer used to train experts: adam, sgd or none
 no_dht: True #if specified, the server will not be attached to a dht
 initial_peers: "[]" #a list of peers that will introduce this node to the dht, e.g. [("1.2.3.4", 1337), ("127.0.0.1", 4321)]
-#dht_port:  #DHT node will listen on this port
-#root_port:  #If this server does not have peers, it will create a virtual dht node on this port. You can then use this node as initial peer.
+#dht_port: none #DHT node will listen on this port
 increase_file_limit: True #On *nix, this will increase the max number of processes a server can spawn before hitting "Too many open files"; Use at your own risk.

+ 28 - 16
scripts/run_server.py

@@ -1,39 +1,53 @@
-from typing import Optional
+from functools import partial
+
 import configargparse
 import resource
+
+import torch
+
 from hivemind.server import Server
 
 if __name__ == '__main__':
     # fmt:off
     parser = configargparse.ArgParser(default_config_files=["config.yml"])
-    parser.add('-c', '--my-config', required=False, is_config_file=True, help='config file path')
+    parser.add('-c', '--config', required=False, is_config_file=True, help='config file path')
     parser.add_argument('--listen_on', type=str, default='0.0.0.0:*', required=False,
                         help="'localhost' for local connections only, '0.0.0.0' for ipv4 '::' for ipv6")
-    parser.add_argument('--num_experts', type=int, default=1, required=False, help="run this many identical experts")
+    parser.add_argument('--num_experts', type=int, default=None, required=False, help="run this many experts")
+    parser.add_argument('--expert_pattern', type=str, default=None, required=False, help='all expert uids will follow'
+                        ' this pattern, e.g. "myexpert.[0:256].[0:1024]" will sample random expert uids'
+                        ' between myexpert.0.0 and myexpert.255.1023. Use either num_experts and this or expert_uids')
+    parser.add_argument('--expert_uids', type=str, nargs="*", default=None, required=False,
+                        help="specify the exact list of expert uids to create. Use either this or num_experts"
+                             " and expert_pattern, not both")
     parser.add_argument('--expert_cls', type=str, default='ffn', required=False,
                         help="expert type from test_utils.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop'.")
     parser.add_argument('--hidden_dim', type=int, default=1024, required=False, help='main dimension for expert_cls')
     parser.add_argument('--num_handlers', type=int, default=None, required=False,
                         help='server will use this many processes to handle incoming requests')
-    parser.add_argument('--expert_prefix', type=str, default='expert', required=False,
-                        help='all expert uids will be {expert_prefix}.{index}')
-    parser.add_argument('--expert_offset', type=int, default=0, required=False,
-                        help='expert uid will use indices in range(expert_offset, expert_offset + num_experts)')
     parser.add_argument('--max_batch_size', type=int, default=16384, required=False,
                         help='total num examples in the same batch will not exceed this value')
     parser.add_argument('--device', type=str, default=None, required=False,
                         help='all experts will use this device in torch notation; default: cuda if available else cpu')
-    parser.add_argument('--no_optimizer', action='store_true', help='if specified, all optimizers use learning rate=0')
+    parser.add_argument('--optimizer', type=str, default='adam', required=False, help='adam, sgd or none')
     parser.add_argument('--no_dht', action='store_true', help='if specified, the server will not be attached to a dht')
-    parser.add_argument('--initial_peers', type=str, default="[]", required=False, help='a list of peers that will'
-                        ' introduce this node to the dht, e.g. [("1.2.3.4", 1337), ("127.0.0.1", 4321)]')
+    parser.add_argument('--initial_peers', type=str, nargs='*', required=False, default=[], help='one or more peers'
+                        ' that can welcome you to the dht, e.g. 1.2.3.4:1337 192.132.231.4:4321')
     parser.add_argument('--dht_port', type=int, default=None, required=False, help='DHT node will listen on this port')
-    parser.add_argument('--root_port', type=int, default=None, required=False, help='If this server does not have peers'
-                        ', it will create a virtual dht node on this port. You can then use this node as initial peer.')
     parser.add_argument('--increase_file_limit', action='store_true', help='On *nix, this will increase the max number'
                         ' of processes a server can spawn before hitting "Too many open files"; Use at your own risk.')
     # fmt:on
     args = vars(parser.parse_args())
+    args.pop('config', None)
+    optimizer = args.pop('optimizer')
+    if optimizer == 'adam':
+        Optimizer = torch.optim.Adam
+    elif optimizer == 'sgd':
+        Optimizer = partial(torch.optim.SGD, lr=0.01)
+    elif optimizer == 'none':
+        Optimizer = None
+    else:
+        raise ValueError("Optimizer must be adam, sgd or none")
 
     if args.pop('increase_file_limit'):
         soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
@@ -43,10 +57,8 @@ if __name__ == '__main__':
         except:
             print("Could not increase open file limit, currently at soft={}, hard={}".format(soft, hard))
 
-    args['initial_peers'] = eval(args['initial_peers'])
-
     try:
-        server = Server.create(**args, start=True, verbose=True)
+        server = Server.create(**args, Optimizer=Optimizer, start=True, verbose=True)
         server.join()
     finally:
-        server.shutdown()
+        server.shutdown()

+ 5 - 4
setup.py

@@ -53,9 +53,10 @@ setup(
     name='hivemind',
     version=version_string,
     cmdclass={'install': ProtoCompileInstall, 'develop': ProtoCompileDevelop},
-    description='',
-    long_description='',
-    author='Learning@home authors',
+    description='Decentralized deep learning framework in pytorch.',
+    long_description='Decentralized deep learning framework in pytorch. Built to train giant models on '
+                     'thousands of volunteers across the world.',
+    author='Learning@home & contributors',
     author_email='mryabinin@hse.ru',
     url="https://github.com/learning-at-home/hivemind",
     packages=find_packages(exclude=['tests']),
@@ -78,5 +79,5 @@ setup(
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
     # What does your project relate to?
-    keywords='pytorch, deep learning, machine learning, gpu, distributed computing',
+    keywords='pytorch, deep learning, machine learning, gpu, distributed computing, volunteer computing, dht',
 )

+ 3 - 3
tests/test_moe.py

@@ -36,7 +36,7 @@ def test_call_many():
     atol = 1e-6
 
     with background_server(num_experts=5, device='cpu', expert_cls='ffn', num_handlers=8, hidden_dim=64,
-                           no_optimizer=True, no_dht=True) as (server_endpoint, dht_endpoint):
+                           Optimizer=None, no_dht=True) as (server_endpoint, dht_endpoint):
         inputs = torch.randn(4, 64, requires_grad=True)
         inputs_clone = inputs.clone().detach().requires_grad_(True)
         e0, e1, e2, e3, e4 = [hivemind.RemoteExpert(f'expert.{i}', server_endpoint) for i in range(5)]
@@ -78,7 +78,7 @@ def test_call_many():
 
 def test_remote_module_call():
     with background_server(num_experts=1, device='cpu', expert_cls='ffn', num_handlers=1, hidden_dim=1024,
-                           no_optimizer=True, no_dht=True) as (server_endpoint, dht_endpoint):
+                           Optimizer=None, no_dht=True) as (server_endpoint, dht_endpoint):
         real_expert = hivemind.RemoteExpert('expert.0', server_endpoint)
         fake_expert = hivemind.RemoteExpert('oiasfjiasjf', server_endpoint)
 
@@ -130,7 +130,7 @@ def test_determinism():
     mask = torch.randint(0, 1, (32, 1024))
 
     with background_server(num_experts=1, device='cpu', expert_cls='det_dropout', num_handlers=1,
-                           no_optimizer=True, no_dht=True) as (server_endpoint, dht_endpoint):
+                           Optimizer=None, no_dht=True) as (server_endpoint, dht_endpoint):
         expert = hivemind.RemoteExpert(uid=f'expert.0', endpoint=server_endpoint)
 
         out = expert(xx, mask)

+ 3 - 1
tests/test_training.py

@@ -1,3 +1,4 @@
+from functools import partial
 from typing import Optional
 
 import torch
@@ -11,8 +12,9 @@ from hivemind import RemoteExpert, background_server
 def test_training(port: Optional[int] = None, max_steps: int = 100, threshold: float = 0.9):
     dataset = load_digits()
     X_train, y_train = torch.tensor(dataset['data'], dtype=torch.float), torch.tensor(dataset['target'])
+    SGD = partial(torch.optim.SGD, lr=0.05)
 
-    with background_server(num_experts=2, device='cpu', hidden_dim=64) as (server_endpoint, _):
+    with background_server(num_experts=2, device='cpu', Optimizer=SGD, hidden_dim=64) as (server_endpoint, _):
         expert1 = RemoteExpert('expert.0', server_endpoint)
         expert2 = RemoteExpert('expert.1', server_endpoint)
         model = nn.Sequential(expert2, nn.Tanh(), expert1, nn.Linear(64, 10))