5 years ago · 70dadfb8b5
--- a/hivemind/__init__.py
+++ b/hivemind/__init__.py
@@ -3,4 +3,4 @@ from hivemind.dht import *
 
															 from hivemind.server import *
														
 
															 from hivemind.utils import *
														
 
															-__version__ = '0.8.5'
														
 
															+__version__ = '0.8.6'
														
--- a/hivemind/dht/__init__.py
+++ b/hivemind/dht/__init__.py
@@ -328,7 +328,7 @@ class DHT(mp.Process):
 
															         :returns: a ordered dict{uid_prefix -> RemoteExpert} mapping at most :k: prefixes to matching experts
														
 
															             The keys in the returned dict are ordered same as in uid_prefixes.
														
 
															         """
														
 
															-        logger.warning("first_k_active is deprecated and will be removed in 0.8.6")
														
 
															+        logger.warning("first_k_active is deprecated and will be removed in 0.8.7")
														
 
															         assert not isinstance(uid_prefixes, str), "please provide a list/tuple of prefixes as the first argument"
														
 
															         future, _future = MPFuture.make_pair()
														
 
															         self.pipe.send(('_first_k_active', [],
														
--- a/hivemind/server/__init__.py
+++ b/hivemind/server/__init__.py
@@ -1,23 +1,24 @@
 
															 from __future__ import annotations
														
 
															+
														
 
															 import multiprocessing as mp
														
 
															 import multiprocessing.synchronize
														
 
															-import threading
														
 
															 import random
														
 
															+import threading
														
 
															 from contextlib import contextmanager
														
 
															 from functools import partial
														
 
															+from typing import Dict, Optional, Tuple, List
														
 
															 import torch
														
 
															-from typing import Dict, Optional, Tuple, List
														
 
															 import hivemind
														
 
															 from hivemind.dht import DHT
														
 
															-from hivemind.server.runtime import Runtime
														
 
															-from hivemind.server.task_pool import Task, TaskPool, TaskPoolBase
														
 
															-from hivemind.server.expert_backend import ExpertBackend
														
 
															 from hivemind.server.checkpoint_saver import CheckpointSaver
														
 
															 from hivemind.server.connection_handler import ConnectionHandler
														
 
															 from hivemind.server.dht_handler import DHTHandlerThread
														
 
															+from hivemind.server.expert_backend import ExpertBackend
														
 
															 from hivemind.server.layers import name_to_block, name_to_input
														
 
															+from hivemind.server.runtime import Runtime
														
 
															+from hivemind.server.task_pool import Task, TaskPool, TaskPoolBase
														
 
															 from hivemind.utils import Endpoint, get_port, replace_port, find_open_port, get_logger
														
 
															 logger = get_logger(__name__)
														
@@ -66,7 +67,7 @@ class Server(threading.Thread):
 
															     @staticmethod
														
 
															     def create(listen_on='0.0.0.0:*', num_experts: int = None, expert_uids: str = None, expert_pattern: str = None,
														
 
															-               expert_cls='ffn', hidden_dim=1024, Optimizer=torch.optim.Adam, num_handlers=None, max_batch_size=4096,
														
 
															+               expert_cls='ffn', hidden_dim=1024, optim_cls=torch.optim.Adam, num_handlers=None, max_batch_size=4096,
														
 
															                device=None, no_dht=False, initial_peers=(), dht_port=None, verbose=True,
														
 
															                *, start: bool, **kwargs) -> Server:
														
 
															         """
														
@@ -76,12 +77,12 @@ class Server(threading.Thread):
 
															         :param expert_pattern: a string pattern or a list of expert uids,  example: myprefix.[0:32].[0:256]\
														
 
															          means "sample random experts between myprefix.0.0 and myprefix.255.255;
														
 
															         :param expert_uids: spawn experts with these exact uids, overrides num_experts and expert_pattern
														
 
															-        :param expert_cls: expert type from test_utils.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop';
														
 
															+        :param expert_cls: expert type from hivemind.server.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop';
														
 
															         :param hidden_dim: main dimension for expert_cls
														
 
															         :param num_handlers: server will use this many parallel processes to handle incoming requests
														
 
															         :param max_batch_size: total num examples in the same batch will not exceed this value
														
 
															         :param device: all experts will use this device in torch notation; default: cuda if available else cpu
														
 
															-        :param Optimizer: uses this optimizer to train all experts
														
 
															+        :param optim_cls: uses this optimizer to train all experts
														
 
															         :param no_dht: if specified, the server will not be attached to a dht
														
 
															         :param initial_peers: a list of peers that will introduce this node to the dht,\
														
 
															          e.g. ('123.11.22.33:1337', '[fe80::abe2:db1c:be7d:5a85]:4567'), default = no peers
														
@@ -112,7 +113,7 @@ class Server(threading.Thread):
 
															         num_experts = len(expert_uids)
														
 
															         num_handlers = num_handlers if num_handlers is not None else num_experts * 8
														
 
															-        Optimizer = Optimizer if Optimizer is not None else partial(torch.optim.SGD, lr=0.0)
														
 
															+        optim_cls = optim_cls if optim_cls is not None else partial(torch.optim.SGD, lr=0.0)
														
 
															         device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
														
 
															         sample_input = name_to_input[expert_cls](4, hidden_dim)
														
@@ -129,7 +130,7 @@ class Server(threading.Thread):
 
															             experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert,
														
 
															                                                          args_schema=args_schema,
														
 
															                                                          outputs_schema=hivemind.BatchTensorDescriptor(hidden_dim),
														
 
															-                                                         opt=Optimizer(expert.parameters()),
														
 
															+                                                         opt=optim_cls(expert.parameters()),
														
 
															                                                          max_batch_size=max_batch_size,
														
 
															                                                          )
														
 
															         # actually start server
														
@@ -314,4 +315,3 @@ def generate_uids_from_pattern(num_experts: int, expert_pattern: Optional[str],
 
															         logger.warning(f"Found only {len(found_uids)} out of {num_experts} free expert uids after "
														
 
															                        f"{attempts_per_expert * num_experts} attempts")
														
 
															     return found_uids
														
 
															-
														
--- a/hivemind/server/layers/__init__.py
+++ b/hivemind/server/layers/__init__.py
@@ -1,79 +1,7 @@
 
															 import torch
														
 
															-import torch.nn as nn
														
 
															-
														
 
															-from hivemind.server.layers.dropout import DeterministicDropout
														
 
															-
														
 
															-
														
 
															-class FeedforwardBlock(nn.Module):
														
 
															-    def __init__(self, hid_dim):
														
 
															-        super().__init__()
														
 
															-        self.layers = nn.Sequential(
														
 
															-            nn.Linear(hid_dim, 4 * hid_dim),
														
 
															-            nn.LayerNorm(4 * hid_dim),
														
 
															-            nn.ReLU(inplace=True),
														
 
															-            nn.Linear(4 * hid_dim, 4 * hid_dim),
														
 
															-            nn.LayerNorm(4 * hid_dim),
														
 
															-            nn.ReLU(inplace=True),
														
 
															-            nn.Linear(4 * hid_dim, hid_dim),
														
 
															-        )
														
 
															-
														
 
															-    def forward(self, x):
														
 
															-        return x + self.layers(x)
														
 
															-
														
 
															-
														
 
															-class TransformerEncoderLayer(nn.Module):
														
 
															-    """
														
 
															-    A slight modification of torch.nn.TransformerEncoderLayer which allows for torch.jit scripting
														
 
															-    """
														
 
															-
														
 
															-    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
														
 
															-        super().__init__()
														
 
															-        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
														
 
															-        # Implementation of Feedforward model
														
 
															-        self.linear1 = nn.Linear(d_model, dim_feedforward)
														
 
															-        self.dropout = nn.Dropout(dropout)
														
 
															-        self.linear2 = nn.Linear(dim_feedforward, d_model)
														
 
															-
														
 
															-        self.norm1 = nn.LayerNorm(d_model)
														
 
															-        self.norm2 = nn.LayerNorm(d_model)
														
 
															-        self.dropout1 = nn.Dropout(dropout)
														
 
															-        self.dropout2 = nn.Dropout(dropout)
														
 
															-
														
 
															-        self.activation = torch.nn.GELU()
														
 
															-
														
 
															-    def forward(self, src):
														
 
															-        src.transpose_(0, 1)
														
 
															-        src2 = self.self_attn(src, src, src)[0]
														
 
															-        src = src + self.dropout1(src2)
														
 
															-        src = self.norm1(src)
														
 
															-        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
														
 
															-        src = src + self.dropout2(src2)
														
 
															-        src = self.norm2(src)
														
 
															-        src.transpose_(0, 1)
														
 
															-        return src
														
 
															-
														
 
															-
														
 
															-class NopExpert(nn.Sequential):
														
 
															-    def __init__(self, hid_dim):
														
 
															-        super().__init__()
														
 
															-        self.w = nn.Parameter(torch.zeros(0), requires_grad=True)
														
 
															-
														
 
															-    def forward(self, x):
														
 
															-        return x.clone()
														
 
															-
														
 
															-
														
 
															-class DeterministicDropoutNetwork(nn.Module):
														
 
															-    def __init__(self, hid_dim, dropout_prob):
														
 
															-        super().__init__()
														
 
															-        self.linear_in = nn.Linear(hid_dim, 2 * hid_dim)
														
 
															-        self.activation = nn.ReLU()
														
 
															-        self.dropout = DeterministicDropout(dropout_prob)
														
 
															-        self.linear_out = nn.Linear(2 * hid_dim, hid_dim)
														
 
															-
														
 
															-    def forward(self, x, mask):
														
 
															-        x = self.linear_in(self.dropout(x, mask))
														
 
															-        return self.linear_out(self.activation(x))
														
 
															+from hivemind.server.layers.common import FeedforwardBlock, TransformerEncoderLayer, NopExpert
														
 
															+from hivemind.server.layers.dropout import DeterministicDropout, DeterministicDropoutNetwork
														
 
															 name_to_block = {'ffn': lambda hid_dim: FeedforwardBlock(hid_dim),
														
 
															                  'transformer': lambda hid_dim: TransformerEncoderLayer(hid_dim, nhead=16),
														
--- a/hivemind/server/layers/common.py
+++ b/hivemind/server/layers/common.py
@@ -0,0 +1,60 @@
 
															+import torch
														
 
															+from torch import nn as nn
														
 
															+
														
 
															+
														
 
															+class FeedforwardBlock(nn.Module):
														
 
															+    """
														
 
															+    Residual feed-forward block: Linear(hid_dim, 4 * hid_dim) -> LayerNorm -> ReLU ->
														
 
															+    Linear(4 * hid_dim, 4 * hid_dim) -> LayerNorm -> ReLU -> Linear(4 * hid_dim, hid_dim),
														
 
															+    whose output is added to the block's input.
														
 
															+
														
 
															+    :param hid_dim: number of input and output features
														
 
															+    """
														
 
															+    def __init__(self, hid_dim):
														
 
															+        super().__init__()
														
 
															+        self.layers = nn.Sequential(
														
 
															+            nn.Linear(hid_dim, 4 * hid_dim),
														
 
															+            nn.LayerNorm(4 * hid_dim),
														
 
															+            nn.ReLU(inplace=True),
														
 
															+            nn.Linear(4 * hid_dim, 4 * hid_dim),
														
 
															+            nn.LayerNorm(4 * hid_dim),
														
 
															+            nn.ReLU(inplace=True),
														
 
															+            nn.Linear(4 * hid_dim, hid_dim),
														
 
															+        )
														
 
															+
														
 
															+    def forward(self, x):
														
 
															+        # residual connection around the feed-forward stack
														
 
															+        return x + self.layers(x)
														
 
															+
														
 
															+
														
 
															+class TransformerEncoderLayer(nn.Module):
														
 
															+    """
														
 
															+    A slight modification of torch.nn.TransformerEncoderLayer which allows for torch.jit scripting
														
 
															+
														
 
															+    Self-attention followed by a two-layer feed-forward network with GELU activation; each of the two
														
 
															+    sub-layers is wrapped in dropout + residual connection and followed by LayerNorm.
														
 
															+
														
 
															+    :param d_model: embedding size of the attention; also the in/out size of the feed-forward network
														
 
															+    :param nhead: number of attention heads
														
 
															+    :param dim_feedforward: hidden size of the feed-forward network
														
 
															+    :param dropout: dropout probability used by the attention and by every nn.Dropout in this layer
														
 
															+    """
														
 
															+
														
 
															+    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
														
 
															+        super().__init__()
														
 
															+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
														
 
															+        # Implementation of Feedforward model
														
 
															+        self.linear1 = nn.Linear(d_model, dim_feedforward)
														
 
															+        self.dropout = nn.Dropout(dropout)
														
 
															+        self.linear2 = nn.Linear(dim_feedforward, d_model)
														
 
															+
														
 
															+        self.norm1 = nn.LayerNorm(d_model)
														
 
															+        self.norm2 = nn.LayerNorm(d_model)
														
 
															+        self.dropout1 = nn.Dropout(dropout)
														
 
															+        self.dropout2 = nn.Dropout(dropout)
														
 
															+
														
 
															+        self.activation = torch.nn.GELU()
														
 
															+
														
 
															+    def forward(self, src):
														
 
															+        # presumably converts batch-first input to the sequence-first layout of nn.MultiheadAttention - TODO confirm
														
 
															+        # NOTE(review): transpose_ is in-place and src is rebound below, so the tensor passed in by the caller
														
 
															+        # keeps its first two dims swapped after this call - confirm that callers never reuse it
														
 
															+        src.transpose_(0, 1)
														
 
															+        src2 = self.self_attn(src, src, src)[0]  # keep only the first element of the returned tuple
														
 
															+        src = src + self.dropout1(src2)
														
 
															+        src = self.norm1(src)
														
 
															+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
														
 
															+        src = src + self.dropout2(src2)
														
 
															+        src = self.norm2(src)
														
 
															+        src.transpose_(0, 1)  # swap the first two dims of the output back
														
 
															+        return src
														
 
															+
														
 
															+
														
 
															+class NopExpert(nn.Sequential):
														
 
															+    """
														
 
															+    An expert that performs no computation: forward returns a copy of its input.
														
 
															+
														
 
															+    :param hid_dim: not used by this class
														
 
															+    """
														
 
															+    def __init__(self, hid_dim):
														
 
															+        super().__init__()
														
 
															+        # zero-element trainable parameter; presumably keeps .parameters() non-empty for optimizers - TODO confirm
														
 
															+        self.w = nn.Parameter(torch.zeros(0), requires_grad=True)
														
 
															+
														
 
															+    def forward(self, x):
														
 
															+        return x.clone()
														
--- a/hivemind/server/layers/dropout.py
+++ b/hivemind/server/layers/dropout.py
@@ -1,5 +1,6 @@
 
															 import torch.autograd
														
 
															 import torch.nn as nn
														
 
															+from torch import nn as nn
														
 
															 class DeterministicDropoutFunction(torch.autograd.Function):
														
@@ -29,3 +30,16 @@ class DeterministicDropout(nn.Module):
 
															             return DeterministicDropoutFunction.apply(x, self.keep_prob, mask)
														
 
															         else:
														
 
															             return x
														
 
															+
														
 
															+
														
 
															+class DeterministicDropoutNetwork(nn.Module):
														
 
															+    """
														
 
															+    Applies DeterministicDropout to the input using a caller-supplied mask, then
														
 
															+    Linear(hid_dim, 2 * hid_dim) -> ReLU -> Linear(2 * hid_dim, hid_dim).
														
 
															+
														
 
															+    :param hid_dim: number of input and output features
														
 
															+    :param dropout_prob: passed to DeterministicDropout
														
 
															+    """
														
 
															+    def __init__(self, hid_dim, dropout_prob):
														
 
															+        super().__init__()
														
 
															+        self.linear_in = nn.Linear(hid_dim, 2 * hid_dim)
														
 
															+        self.activation = nn.ReLU()
														
 
															+        self.dropout = DeterministicDropout(dropout_prob)
														
 
															+        self.linear_out = nn.Linear(2 * hid_dim, hid_dim)
														
 
															+
														
 
															+    def forward(self, x, mask):
														
 
															+        # dropout is applied to the raw input, before the first linear layer
														
 
															+        x = self.linear_in(self.dropout(x, mask))
														
 
															+        return self.linear_out(self.activation(x))
														
--- a/hivemind/utils/threading.py
+++ b/hivemind/utils/threading.py
@@ -1,6 +1,10 @@
 
															 import os
														
 
															 from concurrent.futures import Future, ThreadPoolExecutor
														
 
															+from hivemind.utils import get_logger
														
 
															+
														
 
															+logger = get_logger(__name__)
														
 
															+
														
 
															 EXECUTOR_PID, GLOBAL_EXECUTOR = None, None
														
@@ -11,3 +15,16 @@ def run_in_background(func: callable, *args, **kwargs) -> Future:
 
															         GLOBAL_EXECUTOR = ThreadPoolExecutor(max_workers=os.environ.get("HIVEMIND_THREADS", float('inf')))
														
 
															         EXECUTOR_PID = os.getpid()
														
 
															     return GLOBAL_EXECUTOR.submit(func, *args, **kwargs)
														
 
															+
														
 
															+
														
 
															+def increase_file_limit(new_soft=2 ** 15, new_hard=2 ** 15):
														
 
															+    """ Increase the maximum number of open files. On Linux, this allows spawning more processes/threads. """
														
 
															+    try:
														
 
															+        import resource  # local import to avoid ImportError for Windows users
														
 
															+        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
														
 
															+        new_soft = max(soft, new_soft)
														
 
															+        new_hard = max(hard, new_hard)
														
 
															+        logger.info(f"Increasing file limit: soft {soft}=>{new_soft}, hard {hard}=>{new_hard}")
														
 
															+        return resource.setrlimit(resource.RLIMIT_NOFILE, (new_soft, new_hard))
														
 
															+    except Exception as e:
														
 
															+        logger.warning(f"Failed to increase file limit: {e}")
														
--- a/scripts/config.yml
+++ b/scripts/config.yml
@@ -1,12 +1,10 @@
 
															-listen_on: 0.0.0.0:* #'localhost' for local connections only, '0.0.0.0' for ipv4 '::' for ipv6
														
 
															-num_experts: 1 #run this many identical experts
														
 
															-expert_cls: ffn #expert type from test_utils.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop'.
														
 
															-hidden_dim: 1024 #main dimension for expert_cls
														
 
															-expert_prefix: expert #all expert uids will be {expert_prefix}.{index}
														
 
															-expert_offset: 0 #expert uid will use indices in range(expert_offset, expert_offset + num_experts)
														
 
															-max_batch_size: 16384 #total num examples in the same batch will not exceed this value
														
 
															-optimizer: adam #if specified, all optimizers use learning rate=0
														
 
															-no_dht: True #if specified, the server will not be attached to a dht
														
 
															-initial_peers: "[]" #a list of peers that will introduce this node to the dht, e.g. [("1.2.3.4", 1337), ("127.0.0.1", 4321)]
														
 
															-#dht_port: none #DHT node will listen on this port
														
 
															-increase_file_limit: True #On *nix, this will increase the max number of processes a server can spawn before hitting "Too many open files"; Use at your own risk.
														
 
															+listen_on: 0.0.0.0:*
														
 
															+num_experts: 16
														
 
															+expert_cls: ffn
														
 
															+hidden_dim: 1024
														
 
															+expert_pattern: expert.[0:4].[0:4]
														
 
															+max_batch_size: 16384
														
 
															+optimizer: adam
														
 
															+no_dht: True
														
 
															+initial_peers: "[]"
														
 
															+increase_file_limit: True
														
--- a/scripts/run_server.py
+++ b/scripts/run_server.py
@@ -6,17 +6,19 @@ import resource
 
															 import torch
														
 
															 from hivemind.server import Server
														
 
															+from hivemind.utils.threading import increase_file_limit
														
 
															-if __name__ == '__main__':
														
 
															+
														
 
															+def main():
														
 
															     # fmt:off
														
 
															     parser = configargparse.ArgParser(default_config_files=["config.yml"])
														
 
															     parser.add('-c', '--config', required=False, is_config_file=True, help='config file path')
														
 
															     parser.add_argument('--listen_on', type=str, default='0.0.0.0:*', required=False,
														
 
															                         help="'localhost' for local connections only, '0.0.0.0' for ipv4 '::' for ipv6")
														
 
															-    parser.add_argument('--num_experts', type=int, default=None, required=False, help="run this many experts")
														
 
															-    parser.add_argument('--expert_pattern', type=str, default=None, required=False, help='all expert uids will follow'
														
 
															-                        ' this pattern, e.g. "myexpert.[0:256].[0:1024]" will sample random expert uids'
														
 
															-                        ' between myexpert.0.0 and myexpert.255.1023 . Use either num_experts and this or expert_uids')
														
 
															+    parser.add_argument('--num_experts', type=int, default=None, required=False, help="The number of experts to serve")
														
 
															+    parser.add_argument('--expert_pattern', type=str, default=None, required=False,
														
 
															+                        help='all expert uids will follow this pattern, e.g. "myexpert.[0:256].[0:1024]" will sample random expert uids'
														
 
															+                             ' between myexpert.0.0 and myexpert.255.1023 . Use either num_experts and this or expert_uids')
														
 
															     parser.add_argument('--expert_uids', type=str, nargs="*", default=None, required=False,
														
 
															                         help="specify the exact list of expert uids to create. Use either this or num_experts"
														
 
															                              " and expert_pattern, not both")
														
@@ -26,39 +28,39 @@ if __name__ == '__main__':
 
															     parser.add_argument('--num_handlers', type=int, default=None, required=False,
														
 
															                         help='server will use this many processes to handle incoming requests')
														
 
															     parser.add_argument('--max_batch_size', type=int, default=16384, required=False,
														
 
															-                        help='total num examples in the same batch will not exceed this value')
														
 
															+                        help='The total number of examples in the same batch will not exceed this value')
														
 
															     parser.add_argument('--device', type=str, default=None, required=False,
														
 
															                         help='all experts will use this device in torch notation; default: cuda if available else cpu')
														
 
															     parser.add_argument('--optimizer', type=str, default='adam', required=False, help='adam, sgd or none')
														
 
															     parser.add_argument('--no_dht', action='store_true', help='if specified, the server will not be attached to a dht')
														
 
															-    parser.add_argument('--initial_peers', type=str, nargs='*', required=False, default=[], help='one or more peers'
														
 
															-                        ' that can welcome you to the dht, e.g. 1.2.3.4:1337 192.132.231.4:4321')
														
 
															+    parser.add_argument('--initial_peers', type=str, nargs='*', required=False, default=[],
														
 
															+                        help='one or more peers that can welcome you to the dht, e.g. 1.2.3.4:1337 192.132.231.4:4321')
														
 
															     parser.add_argument('--dht_port', type=int, default=None, required=False, help='DHT node will listen on this port')
														
 
															-    parser.add_argument('--increase_file_limit', action='store_true', help='On *nix, this will increase the max number'
														
 
															-                        ' of processes a server can spawn before hitting "Too many open files"; Use at your own risk.')
														
 
															+    parser.add_argument('--increase_file_limit', action='store_true',
														
 
															+                        help='On *nix, this will increase the max number of processes '
														
 
															+                             'a server can spawn before hitting "Too many open files"; Use at your own risk.')
														
 
															     # fmt:on
														
 
															     args = vars(parser.parse_args())
														
 
															     args.pop('config', None)
														
 
															     optimizer = args.pop('optimizer')
														
 
															     if optimizer == 'adam':
														
 
															-        Optimizer = torch.optim.Adam
														
 
															+        optim_cls = torch.optim.Adam
														
 
															     elif optimizer == 'sgd':
														
 
															-        Optimizer = partial(torch.optim.SGD, lr=0.01)
														
 
															+        optim_cls = partial(torch.optim.SGD, lr=0.01)
														
 
															     elif optimizer == 'none':
														
 
															-        Optimizer = None
														
 
															+        optim_cls = None
														
 
															     else:
														
 
															-        raise ValueError("Optimizer must be adam, sgd or none")
														
 
															+        raise ValueError("optim_cls must be adam, sgd or none")
														
 
															     if args.pop('increase_file_limit'):
														
 
															-        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
														
 
															-        try:
														
 
															-            print("Setting open file limit to soft={}, hard={}".format(max(soft, 2 ** 15), max(hard, 2 ** 15)))
														
 
															-            resource.setrlimit(resource.RLIMIT_NOFILE, (max(soft, 2 ** 15), max(hard, 2 ** 15)))
														
 
															-        except:
														
 
															-            print("Could not increase open file limit, currently at soft={}, hard={}".format(soft, hard))
														
 
															+        increase_file_limit()
														
 
															     try:
														
 
															-        server = Server.create(**args, Optimizer=Optimizer, start=True, verbose=True)
														
 
															+        server = Server.create(**args, optim_cls=optim_cls, start=True, verbose=True)
														
 
															         server.join()
														
 
															     finally:
														
 
															         server.shutdown()
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    main()
														
--- a/setup.py
+++ b/setup.py
@@ -78,6 +78,9 @@ setup(
 
															         'Topic :: Software Development :: Libraries',
														
 
															         'Topic :: Software Development :: Libraries :: Python Modules',
														
 
															     ],
														
 
															+    entry_points={
														
 
															+        'console_scripts': ['hivemind-server = scripts.run_server:main', ]
														
 
															+    },
														
 
															     # What does your project relate to?
														
 
															     keywords='pytorch, deep learning, machine learning, gpu, distributed computing, volunteer computing, dht',
														
 
															 )
														
--- a/tests/benchmark_dht.py
+++ b/tests/benchmark_dht.py
@@ -1,11 +1,12 @@
 
															-import time
														
 
															 import argparse
														
 
															 import random
														
 
															+import time
														
 
															 from warnings import warn
														
 
															-import hivemind
														
 
															+
														
 
															 from tqdm import trange
														
 
															-from test_utils import increase_file_limit
														
 
															+import hivemind
														
 
															+from hivemind.utils.threading import increase_file_limit
														
 
															 def random_endpoint() -> hivemind.Endpoint:
														
--- a/tests/benchmark_throughput.py
+++ b/tests/benchmark_throughput.py
@@ -5,11 +5,12 @@ import sys
 
															 import time
														
 
															 import torch
														
 
															-from hivemind.server import layers
														
 
															-from test_utils import print_device_info, increase_file_limit
														
 
															+from test_utils import print_device_info
														
 
															 import hivemind
														
 
															 from hivemind import find_open_port
														
 
															+from hivemind.server import layers
														
 
															+from hivemind.utils.threading import increase_file_limit
														
 
															 def client_process(can_start, benchmarking_failed, port, num_experts, batch_size, hid_dim, num_batches, backprop=True):
														
--- a/tests/test_moe.py
+++ b/tests/test_moe.py
@@ -34,7 +34,7 @@ def test_call_many():
 
															     atol = 1e-6
														
 
															     with background_server(num_experts=5, device='cpu', expert_cls='ffn', num_handlers=8, hidden_dim=64,
														
 
															-                           Optimizer=None, no_dht=True) as (server_endpoint, dht_endpoint):
														
 
															+                           optim_cls=None, no_dht=True) as (server_endpoint, dht_endpoint):
														
 
															         inputs = torch.randn(4, 64, requires_grad=True)
														
 
															         inputs_clone = inputs.clone().detach().requires_grad_(True)
														
 
															         e0, e1, e2, e3, e4 = [hivemind.RemoteExpert(f'expert.{i}', server_endpoint) for i in range(5)]
														
@@ -76,7 +76,7 @@ def test_call_many():
 
															 def test_remote_module_call():
														
 
															     with background_server(num_experts=1, device='cpu', expert_cls='ffn', num_handlers=1, hidden_dim=1024,
														
 
															-                           Optimizer=None, no_dht=True) as (server_endpoint, dht_endpoint):
														
 
															+                           optim_cls=None, no_dht=True) as (server_endpoint, dht_endpoint):
														
 
															         real_expert = hivemind.RemoteExpert('expert.0', server_endpoint)
														
 
															         fake_expert = hivemind.RemoteExpert('oiasfjiasjf', server_endpoint)
														
@@ -128,7 +128,7 @@ def test_determinism():
 
															     mask = torch.randint(0, 1, (32, 1024))
														
 
															     with background_server(num_experts=1, device='cpu', expert_cls='det_dropout', num_handlers=1,
														
 
															-                           Optimizer=None, no_dht=True) as (server_endpoint, dht_endpoint):
														
 
															+                           optim_cls=None, no_dht=True) as (server_endpoint, dht_endpoint):
														
 
															         expert = hivemind.RemoteExpert(uid=f'expert.0', endpoint=server_endpoint)
														
 
															         out = expert(xx, mask)
														
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -14,7 +14,7 @@ def test_training(port: Optional[int] = None, max_steps: int = 100, threshold: f
 
															     X_train, y_train = torch.tensor(dataset['data'], dtype=torch.float), torch.tensor(dataset['target'])
														
 
															     SGD = partial(torch.optim.SGD, lr=0.05)
														
 
															-    with background_server(num_experts=2, device='cpu', Optimzer=SGD, hidden_dim=64) as (server_endpoint, _):
														
 
															+    with background_server(num_experts=2, device='cpu', optim_cls=SGD, hidden_dim=64) as (server_endpoint, _):
														
 
															         expert1 = RemoteExpert('expert.0', server_endpoint)
														
 
															         expert2 = RemoteExpert('expert.1', server_endpoint)
														
 
															         model = nn.Sequential(expert2, nn.Tanh(), expert1, nn.Linear(64, 10))
														
--- a/tests/test_utils/__init__.py
+++ b/tests/test_utils/__init__.py
@@ -1,5 +1,3 @@
 
															-from warnings import warn
														
 
															-
														
 
															 import torch
														
@@ -14,14 +12,3 @@ def print_device_info(device=None):
 
															         print('Memory Usage:')
														
 
															         print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1), 'GB')
														
 
															         print('Cached:   ', round(torch.cuda.memory_cached(0) / 1024 ** 3, 1), 'GB')
														
 
															-
														
 
															-
														
 
															-def increase_file_limit(new_soft=2 ** 15, new_hard=2 ** 15):
														
 
															-    """ Increase the maximum number of open files. On Linux, this allows spawning more processes/threads. """
														
 
															-    try:
														
 
															-        import resource  # note: local import to avoid ImportError for those who don't have it
														
 
															-        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
														
 
															-        print(f"Increasing file limit - soft {soft}=>{new_soft}, hard {hard}=>{new_hard}")
														
 
															-        return resource.setrlimit(resource.RLIMIT_NOFILE, (max(soft, new_soft), max(hard, new_hard)))
														
 
															-    except Exception as e:
														
 
															-        warn(f"Failed to increase file limit: {e}")