
Add Dockerfile, refactor tests (#245)

* Add Dockerfile

* Move benchmarks to hivemind/benchmarks

* Move custom_networks to test_utils

* Add "Push to Docker Hub" GitHub action
Max Ryabinin 4 years ago
parent
commit
dfbc401196

+ 55 - 0
.github/workflows/push-docker-image.yml

@@ -0,0 +1,55 @@
+name: Push to Docker Hub
+
+on:
+  push:
+    branches: [ master ]
+    tags:
+      - "*.*.*"
+  pull_request:
+    branches: [ master ]
+
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Docker meta
+        id: meta
+        uses: crazy-max/ghaction-docker-meta@v2
+        with:
+          # list of Docker images to use as base name for tags
+          images: |
+            learningathome/hivemind
+          # generate Docker tags based on the following events/attributes
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=semver,pattern={{major}}
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v1
+
+      - name: Login to Docker Hub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKER_HUB_USERNAME }}
+          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
+
+      - name: Build and push
+        id: docker_build
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+
+      - name: Image digest
+        run: echo ${{ steps.docker_build.outputs.digest }}
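
For clarity, the three semver patterns above expand a single release tag into a stack of image tags. The sketch below is an illustration of the expected result for a hypothetical release tag `1.2.3`, not the action's actual implementation:

```python
# Illustration only: mimics the semver tag expansion configured above for
# crazy-max/ghaction-docker-meta; the real logic lives inside the action.
def expected_release_tags(version, image="learningathome/hivemind"):
    major, minor, _patch = version.split(".")
    return [
        f"{image}:{version}",        # type=semver,pattern={{version}}
        f"{image}:{major}.{minor}",  # type=semver,pattern={{major}}.{{minor}}
        f"{image}:{major}",          # type=semver,pattern={{major}}
    ]

assert expected_release_tags("1.2.3") == [
    "learningathome/hivemind:1.2.3",
    "learningathome/hivemind:1.2",
    "learningathome/hivemind:1",
]
```

Branch pushes and pull requests get ref-based tags instead (`type=ref`), and pull-request builds are never pushed because of the `push:` condition below.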

+ 3 - 3
CONTRIBUTING.md

@@ -122,9 +122,9 @@ Currently, hivemind has three benchmark scripts for evaluating the impact of code changes on
 performance-sensitive parts of the library. If you make a change that might introduce a regression, you may be asked by
 the maintainers to provide the benchmarking results for your branch and a comparison with the master branch.
 
-* `tests/benchmark_averaging.py` measures the performance of decentralized parameter averaging across the DHT.
-* `tests/benchmark_dht.py` measures the performance of core DHT operations.
-* `tests/benchmark_throughput.py` measures the performance of a server hosting several expert layers under heavy load
+* `benchmarks/benchmark_averaging.py` measures the performance of decentralized parameter averaging across the DHT.
+* `benchmarks/benchmark_dht.py` measures the performance of core DHT operations.
+* `benchmarks/benchmark_throughput.py` measures the performance of a server hosting several expert layers under heavy load
   from multiple clients.
 
 Example benchmark runs are available in
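
A minimal way to produce such results for a branch comparison is sketched below; it assumes the scripts' default arguments are sufficient and that you run from the repository root (the exact CLI flags of each script may differ):

```python
# Sketch: run each relocated benchmark with its default arguments.
import subprocess
import sys

for script in (
    "benchmarks/benchmark_averaging.py",
    "benchmarks/benchmark_dht.py",
    "benchmarks/benchmark_throughput.py",
):
    subprocess.run([sys.executable, script], check=True)
```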

+ 38 - 0
Dockerfile

@@ -0,0 +1,38 @@
+FROM nvcr.io/nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04
+LABEL maintainer="Learning@home"
+LABEL repository="hivemind"
+
+WORKDIR /home
+# Set en_US.UTF-8 locale by default
+RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
+
+# Install packages
+RUN apt-get update && apt-get install -y --no-install-recommends --force-yes \
+  build-essential \
+  wget \
+  git \
+  vim \
+  && apt-get clean autoclean && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} /tmp/* /var/tmp/*
+
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O install_miniconda.sh && \
+  bash install_miniconda.sh -b -p /opt/conda && rm install_miniconda.sh
+ENV PATH="/opt/conda/bin:${PATH}"
+
+RUN conda install python~=3.8 pip && \
+    pip install --no-cache-dir torch torchvision torchaudio && \
+    conda clean --all
+
+COPY requirements.txt hivemind/requirements.txt
+COPY requirements-dev.txt hivemind/requirements-dev.txt
+COPY examples/albert/requirements.txt hivemind/examples/albert/requirements.txt
+RUN pip install --no-cache-dir -r hivemind/requirements.txt && \
+    pip install --no-cache-dir -r hivemind/requirements-dev.txt && \
+    pip install --no-cache-dir -r hivemind/examples/albert/requirements.txt && \
+    rm -rf ~/.cache/pip
+
+COPY . hivemind/
+RUN cd hivemind && \
+    pip install --no-cache-dir .[dev] && \
+    conda clean --all && rm -rf ~/.cache/pip
+
+CMD bash
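
A quick local smoke test of this image might look like the sketch below; it assumes a local Docker daemon, and the `hivemind:dev` tag is illustrative (CI pushes under `learningathome/hivemind` instead):

```python
# Sketch: build the image from the Dockerfile above and verify that
# hivemind is importable inside the resulting container.
import subprocess

subprocess.run(["docker", "build", "-t", "hivemind:dev", "."], check=True)
subprocess.run(
    ["docker", "run", "--rm", "hivemind:dev",
     "python", "-c", "import hivemind; print(hivemind.__version__)"],
    check=True,
)
```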

+ 1 - 0
tests/benchmark_averaging.py → benchmarks/benchmark_averaging.py

@@ -4,6 +4,7 @@ import threading
 import argparse
 
 import torch
+
 import hivemind
 from hivemind.utils import LOCALHOST, increase_file_limit
 from hivemind.proto import runtime_pb2

+ 0 - 0
tests/benchmark_dht.py → benchmarks/benchmark_dht.py


+ 2 - 1
tests/benchmark_tensor_compression.py → benchmarks/benchmark_tensor_compression.py

@@ -1,5 +1,6 @@
-import time
 import argparse
+import time
+
 import torch
 
 from hivemind.proto.runtime_pb2 import CompressionType

+ 15 - 2
tests/benchmark_throughput.py → benchmarks/benchmark_throughput.py

@@ -5,7 +5,6 @@ import sys
 import time
 
 import torch
-from test_utils import print_device_info
 
 import hivemind
 from hivemind import find_open_port
@@ -13,6 +12,19 @@ from hivemind.server import layers
 from hivemind.utils.threading import increase_file_limit
 
 
+def print_device_info(device=None):
+    """Prints device stats. Code from https://stackoverflow.com/a/53374933/12891528"""
+    device = torch.device(device or ('cuda' if torch.cuda.is_available() else 'cpu'))
+    print('Using device:', device)
+
+    # Additional Info when using cuda
+    if device.type == 'cuda':
+        print(torch.cuda.get_device_name(0))
+        print('Memory Usage:')
+        print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1), 'GB')
+        print('Cached:   ', round(torch.cuda.memory_cached(0) / 1024 ** 3, 1), 'GB')
+
+
 def client_process(can_start, benchmarking_failed, port, num_experts, batch_size, hid_dim, num_batches, backprop=True):
     torch.set_num_threads(1)
     can_start.wait()
@@ -65,7 +77,8 @@ def benchmark_throughput(num_experts=16, num_handlers=None, num_clients=128, num
         for i in range(num_experts):
             expert = torch.jit.script(layers.name_to_block[expert_cls](hid_dim))
             experts[f'expert{i}'] = hivemind.ExpertBackend(name=f'expert{i}',
-                                                           expert=expert, optimizer=torch.optim.Adam(expert.parameters()),
+                                                           expert=expert,
+                                                           optimizer=torch.optim.Adam(expert.parameters()),
                                                            args_schema=(hivemind.BatchTensorDescriptor(hid_dim),),
                                                            outputs_schema=hivemind.BatchTensorDescriptor(hid_dim),
                                                            max_batch_size=max_batch_size,

+ 2 - 2
docs/user/benchmarks.md

@@ -5,7 +5,7 @@ hivemind.
 
 ### Server throughput
 
-You can use [this benchmark](https://github.com/learning-at-home/hivemind/blob/master/tests/benchmark_throughput.py) to
+You can use [this benchmark](https://github.com/learning-at-home/hivemind/blob/master/benchmarks/benchmark_throughput.py) to
 check the performance impact of your changes to hivemind.client and server. The benchmark will start one server without
 DHT with several experts, and then spawn trainer processes that load the server with requests. The two main statistics
 in this benchmark are samples/s and startup time.
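
The benchmark can also be driven directly through the function whose signature appears in the diff above. This is a sketch: it assumes `benchmarks/` is importable (e.g. by running from that directory) and relies on the defaults for all remaining parameters:

```python
# Sketch: invoke the benchmark entry point with the keyword arguments
# visible in its signature; other parameters keep their defaults.
from benchmark_throughput import benchmark_throughput

benchmark_throughput(num_experts=16, num_handlers=None, num_clients=128)
```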
@@ -59,7 +59,7 @@ Cached:    3.2 GB
 
 ### DHT performance
 
-In turn, [this benchmark](https://github.com/learning-at-home/hivemind/blob/master/tests/benchmark_dht.py) can be used
+In turn, [this benchmark](https://github.com/learning-at-home/hivemind/blob/master/benchmarks/benchmark_dht.py) can be used
 to measure performance impact of changes to hivemind.dht. It spawns a DHT with `num_peers` participants, then chooses
 one peer that will declare `num_experts` total experts in batches of `expert_batch_size`. Then, another peer will
 consecutively get all experts and check if they are there.
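
The access pattern described here, reduced to a toy in-memory model (this is not the `hivemind.dht` API, just the declare-then-get flow):

```python
# Toy model: one peer declares experts in batches of expert_batch_size,
# another peer gets every expert back and checks that it is there.
num_experts, expert_batch_size = 256, 32
uids = [f"expert.{i}" for i in range(num_experts)]

store = {}
for start in range(0, num_experts, expert_batch_size):  # declaring peer
    batch = uids[start:start + expert_batch_size]
    store.update({uid: "declaring_peer_endpoint" for uid in batch})

assert all(uid in store for uid in uids)  # reading peer verifies presence
```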

+ 4 - 2
tests/test_custom_expert.py → tests/test_custom_experts.py

@@ -5,13 +5,15 @@ import torch
 
 from hivemind import RemoteExpert, background_server
 
+CUSTOM_EXPERTS_PATH = os.path.join(os.path.dirname(__file__), 'test_utils', 'custom_networks.py')
+
 
 @pytest.mark.forked
 def test_custom_expert(hid_dim=16):
     with background_server(
             expert_cls='perceptron', num_experts=2, device='cpu',
             hidden_dim=hid_dim, num_handlers=2, no_dht=True,
-            custom_module_path=os.path.join(os.path.dirname(__file__), 'custom_networks.py')) as (server_endpoint, _):
+            custom_module_path=CUSTOM_EXPERTS_PATH) as (server_endpoint, _):
         expert0 = RemoteExpert('expert.0', server_endpoint)
         expert1 = RemoteExpert('expert.1', server_endpoint)
 
@@ -32,7 +34,7 @@ def test_multihead_expert(hid_dim=16):
     with background_server(
             expert_cls='multihead', num_experts=2, device='cpu',
             hidden_dim=hid_dim, num_handlers=2, no_dht=True,
-            custom_module_path=os.path.join(os.path.dirname(__file__), 'custom_networks.py')) as (server_endpoint, _):
+            custom_module_path=CUSTOM_EXPERTS_PATH) as (server_endpoint, _):
         expert0 = RemoteExpert('expert.0', server_endpoint)
         expert1 = RemoteExpert('expert.1', server_endpoint)
 
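For reference, a module suitable for `custom_module_path` might look like the sketch below. The `register_expert_class` import path and signature are assumptions inferred from the `expert_cls='perceptron'` usage above, not taken verbatim from this commit:

```python
import torch
import torch.nn as nn

# Assumed API: hivemind is expected to expose a registration decorator for
# custom experts; the exact import path and signature may differ.
from hivemind.server.layers.custom_experts import register_expert_class

# Factory for a sample batch, used by the server to trace the expert.
sample_input = lambda batch_size, hid_dim: torch.empty((batch_size, hid_dim))

@register_expert_class('perceptron', sample_input)
class Perceptron(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.layer = nn.Linear(hid_dim, hid_dim)

    def forward(self, x):
        return torch.relu(self.layer(x))
```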

+ 0 - 14
tests/test_utils/__init__.py

@@ -1,14 +0,0 @@
-import torch
-
-
-def print_device_info(device=None):
-    # prints device stats. Code from https://stackoverflow.com/a/53374933/12891528
-    device = torch.device(device or ('cuda' if torch.cuda.is_available() else 'cpu'))
-    print('Using device:', device)
-
-    # Additional Info when using cuda
-    if device.type == 'cuda':
-        print(torch.cuda.get_device_name(0))
-        print('Memory Usage:')
-        print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1), 'GB')
-        print('Cached:   ', round(torch.cuda.memory_cached(0) / 1024 ** 3, 1), 'GB')

+ 0 - 0
tests/custom_networks.py → tests/test_utils/custom_networks.py