@@ -5,7 +5,6 @@ import sys
 import time
 
 import torch
-from test_utils import print_device_info
 
 import hivemind
 from hivemind import find_open_port
@@ -13,6 +12,19 @@ from hivemind.server import layers
 from hivemind.utils.threading import increase_file_limit
 
 
+def print_device_info(device=None):
+    """Prints device stats. Code from https://stackoverflow.com/a/53374933/12891528"""
+    device = torch.device(device or ('cuda' if torch.cuda.is_available() else 'cpu'))
+    print('Using device:', device)
+
+    # Additional Info when using cuda
+    if device.type == 'cuda':
+        print(torch.cuda.get_device_name(0))
+        print('Memory Usage:')
+        print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1), 'GB')
+        print('Cached:   ', round(torch.cuda.memory_cached(0) / 1024 ** 3, 1), 'GB')
+
+
 def client_process(can_start, benchmarking_failed, port, num_experts, batch_size, hid_dim, num_batches, backprop=True):
     torch.set_num_threads(1)
     can_start.wait()
@@ -65,7 +77,8 @@ def benchmark_throughput(num_experts=16, num_handlers=None, num_clients=128, num
     for i in range(num_experts):
         expert = torch.jit.script(layers.name_to_block[expert_cls](hid_dim))
         experts[f'expert{i}'] = hivemind.ExpertBackend(name=f'expert{i}',
-                                                       expert=expert, optimizer=torch.optim.Adam(expert.parameters()),
+                                                       expert=expert,
+                                                       optimizer=torch.optim.Adam(expert.parameters()),
                                                        args_schema=(hivemind.BatchTensorDescriptor(hid_dim),),
                                                        outputs_schema=hivemind.BatchTensorDescriptor(hid_dim),
                                                        max_batch_size=max_batch_size,
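
For reference, a minimal usage sketch of the helper this diff inlines. The importable module name benchmark_throughput is an assumption, not something the diff specifies; note also that torch.cuda.memory_cached, used here, was later deprecated in favor of torch.cuda.memory_reserved.

    # Hypothetical standalone check of the inlined helper; the module name
    # benchmark_throughput is an assumption, not part of the diff.
    from benchmark_throughput import print_device_info

    # With no argument the helper picks 'cuda' when available, else 'cpu';
    # on a CUDA device it additionally reports allocated and cached memory.
    print_device_info()
    print_device_info('cpu')  # force CPU: prints only the "Using device:" line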