3 lat temu · 11a02608a3
--- a/.github/workflows/run-benchmarks.yml
+++ b/.github/workflows/run-benchmarks.yml
@@ -26,6 +26,9 @@ jobs:
 
															           python -m pip install --upgrade pip
														
 
															           pip install -r requirements.txt
														
 
															           pip install -r requirements-dev.txt
														
 
															+      - name: Build bitsandbytes
														
 
															+        run: |
														
 
															+          pip install bitsandbytes==0.32.3
														
 
															       - name: Build hivemind
														
 
															         run: |
														
 
															           pip install .
														
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -29,6 +29,9 @@ jobs:
 
															           python -m pip install --upgrade pip
														
 
															           pip install -r requirements.txt
														
 
															           pip install -r requirements-dev.txt
														
 
															+      - name: Build bitsandbytes
														
 
															+        run: |
														
 
															+          pip install bitsandbytes==0.32.3
														
 
															       - name: Build hivemind
														
 
															         run: |
														
 
															           pip install .
														
@@ -88,6 +91,9 @@ jobs:
 
															           python -m pip install --upgrade pip
														
 
															           pip install -r requirements.txt
														
 
															           pip install -r requirements-dev.txt
														
 
															+      - name: Build bitsandbytes
														
 
															+        run: |
														
 
															+          pip install bitsandbytes==0.32.3
														
 
															       - name: Build hivemind
														
 
															         run: |
														
 
															           pip install -e . --no-use-pep517
														
--- a/README.md
+++ b/README.md
@@ -53,6 +53,10 @@ If your versions of Python and PyTorch match the requirements, you can install h
 
															 pip install hivemind
														
 
															 ```
														
 
															+Also, if you want to use blockwise 8-bit compression from [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) 
														
 
															+during data transfer, you can install it with `pip install hivemind[bitsandbytes]`. 
														
 
															+After that, you can use the `BlockwiseQuantization` class in [hivemind.compression](./hivemind/compression).
														
 
															+
														
 
															 ### From source
														
 
															 To install hivemind from source, simply run the following:
														
--- a/benchmarks/benchmark_tensor_compression.py
+++ b/benchmarks/benchmark_tensor_compression.py
@@ -11,26 +11,37 @@ use_hivemind_log_handler("in_root_logger")
 
															 logger = get_logger(__name__)
														
 
															-def benchmark_compression(tensor: torch.Tensor, compression_type: CompressionType) -> float:
														
 
															+def benchmark_compression(tensor: torch.Tensor, compression_type: CompressionType) -> [float, float, int]:
														
 
															     t = time.time()
														
 
															-    deserialize_torch_tensor(serialize_torch_tensor(tensor, compression_type))
														
 
															-    return time.time() - t
														
 
															+    serialized = serialize_torch_tensor(tensor, compression_type)
														
 
															+    result = deserialize_torch_tensor(serialized)
														
 
															+    return time.time() - t, (tensor - result).square().mean(), serialized.ByteSize()
														
 
															 if __name__ == "__main__":
														
 
															     parser = argparse.ArgumentParser()
														
 
															-    parser.add_argument("--size", type=int, default=10000000, required=False)
														
 
															+    parser.add_argument("--size", type=int, default=10_000_000, required=False)
														
 
															     parser.add_argument("--seed", type=int, default=7348, required=False)
														
 
															     parser.add_argument("--num_iters", type=int, default=30, required=False)
														
 
															     args = parser.parse_args()
														
 
															     torch.manual_seed(args.seed)
														
 
															-    X = torch.randn(args.size)
														
 
															+    X = torch.randn(args.size, dtype=torch.float32)
														
 
															     for name, compression_type in CompressionType.items():
														
 
															-        tm = 0
														
 
															+        total_time = 0
														
 
															+        compression_error = 0
														
 
															+        total_size = 0
														
 
															         for i in range(args.num_iters):
														
 
															-            tm += benchmark_compression(X, compression_type)
														
 
															-        tm /= args.num_iters
														
 
															-        logger.info(f"Compression type: {name}, time: {tm}")
														
 
															+            iter_time, iter_distortion, size = benchmark_compression(X, compression_type)
														
 
															+            total_time += iter_time
														
 
															+            compression_error += iter_distortion
														
 
															+            total_size += size
														
 
															+        total_time /= args.num_iters
														
 
															+        compression_error /= args.num_iters
														
 
															+        total_size /= args.num_iters
														
 
															+        logger.info(
														
 
															+            f"Compression type: {name}, time: {total_time:.5f}, compression error: {compression_error:.5f}, "
														
 
															+            f"size: {int(total_size):d}"
														
 
															+        )
														
--- a/hivemind/compression/__init__.py
+++ b/hivemind/compression/__init__.py
@@ -5,7 +5,7 @@ Compression strategies that reduce the network communication in .averaging, .opt
 
															 from hivemind.compression.adaptive import PerTensorCompression, RoleAdaptiveCompression, SizeAdaptiveCompression
														
 
															 from hivemind.compression.base import CompressionBase, CompressionInfo, NoCompression, TensorRole
														
 
															 from hivemind.compression.floating import Float16Compression, ScaledFloat16Compression
														
 
															-from hivemind.compression.quantization import Quantile8BitQuantization, Uniform8BitQuantization
														
 
															+from hivemind.compression.quantization import BlockwiseQuantization, Quantile8BitQuantization, Uniform8BitQuantization
														
 
															 from hivemind.compression.serialization import (
														
 
															     deserialize_tensor_stream,
														
 
															     deserialize_torch_tensor,
														
--- a/hivemind/compression/quantization.py
+++ b/hivemind/compression/quantization.py
@@ -1,5 +1,7 @@
 
															+import importlib.util
														
 
															 import math
														
 
															 import os
														
 
															+import warnings
														
 
															 from abc import ABC, abstractmethod
														
 
															 from concurrent.futures import ThreadPoolExecutor
														
 
															 from typing import Tuple
														
@@ -7,6 +9,10 @@ from typing import Tuple
 
															 import numpy as np
														
 
															 import torch
														
 
															+if importlib.util.find_spec("bitsandbytes") is not None:
														
 
															+    warnings.filterwarnings("ignore", module="bitsandbytes", category=UserWarning)
														
 
															+    from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise
														
 
															+
														
 
															 from hivemind.compression.base import CompressionBase, CompressionInfo
														
 
															 from hivemind.proto import runtime_pb2
														
@@ -112,3 +118,60 @@ def quantile_qq_approximation(array: np.ndarray, n_quantiles: int, min_chunk_siz
 
															     for job in jobs:
														
 
															         job.result()
														
 
															     return np.quantile(partition_quantiles, quantiles)
														
 
															+
														
 
															+
														
 
															+BNB_MISSING_MESSAGE = """BlockwiseQuantization requires bitsandbytes to function properly. 
														
 
															+Please install it with `pip install bitsandbytes` 
														
 
															+or using the instruction from https://github.com/TimDettmers/bitsandbytes."""
														
 
															+
														
 
															+
														
 
															+class BlockwiseQuantization(Quantization):
														
 
															+    compression_type = runtime_pb2.BLOCKWISE_8BIT
														
 
															+    codebook_dtype, indices_dtype = np.float32, np.uint8
														
 
															+
														
 
															+    def quantize(
														
 
															+        self, tensor: torch.Tensor, allow_inplace: bool = False
														
 
															+    ) -> Tuple[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
														
 
															+        try:
														
 
															+            quantized, (absmax, codebook) = quantize_blockwise(tensor)
														
 
															+        except NameError:
														
 
															+            raise ImportError(BNB_MISSING_MESSAGE)
														
 
															+        return quantized.numpy(), (absmax.numpy(), codebook.numpy())
														
 
															+
														
 
															+    def compress(self, tensor: torch.Tensor, info: CompressionInfo, allow_inplace: bool = False) -> runtime_pb2.Tensor:
														
 
															+        quantized, (absmax, codebook) = self.quantize(tensor.detach(), allow_inplace=allow_inplace)
														
 
															+
														
 
															+        serialized_data = (
														
 
															+            np.int64(len(absmax)).tobytes(),
														
 
															+            np.int64(len(codebook)).tobytes(),
														
 
															+            absmax.tobytes(),
														
 
															+            codebook.tobytes(),
														
 
															+            quantized.tobytes(),
														
 
															+        )
														
 
															+
														
 
															+        return runtime_pb2.Tensor(
														
 
															+            buffer=b"".join(serialized_data),
														
 
															+            size=tensor.shape,
														
 
															+            requires_grad=tensor.requires_grad,
														
 
															+            dtype=tensor.numpy().dtype.name,
														
 
															+            compression=self.compression_type,
														
 
															+        )
														
 
															+
														
 
															+    def extract(self, serialized_tensor: runtime_pb2.Tensor) -> torch.Tensor:
														
 
															+        absmax_size = int(np.frombuffer(serialized_tensor.buffer, count=1, dtype=np.int64))
														
 
															+        codebook_size = int(np.frombuffer(serialized_tensor.buffer, offset=8, count=1, dtype=np.int64))
														
 
															+        absmax = np.frombuffer(serialized_tensor.buffer, offset=16, count=absmax_size, dtype=self.codebook_dtype)
														
 
															+        codebook = np.frombuffer(
														
 
															+            serialized_tensor.buffer, offset=16 + absmax.nbytes, count=codebook_size, dtype=self.codebook_dtype
														
 
															+        )
														
 
															+        quantized = np.frombuffer(
														
 
															+            serialized_tensor.buffer, offset=16 + absmax.nbytes + codebook.nbytes, dtype=self.indices_dtype
														
 
															+        )
														
 
															+
														
 
															+        absmax = torch.as_tensor(absmax)
														
 
															+        codebook = torch.as_tensor(codebook)
														
 
															+        quantized = torch.as_tensor(quantized).reshape(tuple(serialized_tensor.size))
														
 
															+        try:
														
 
															+            return dequantize_blockwise(quantized, (absmax, codebook))
														
 
															+        except NameError:
														
 
															+            raise ImportError(BNB_MISSING_MESSAGE)
														
--- a/hivemind/compression/serialization.py
+++ b/hivemind/compression/serialization.py
@@ -6,21 +6,22 @@ import torch
 
															 from hivemind.compression.base import CompressionBase, CompressionInfo, NoCompression
														
 
															 from hivemind.compression.floating import Float16Compression, ScaledFloat16Compression
														
 
															-from hivemind.compression.quantization import Quantile8BitQuantization, Uniform8BitQuantization
														
 
															+from hivemind.compression.quantization import BlockwiseQuantization, Quantile8BitQuantization, Uniform8BitQuantization
														
 
															 from hivemind.proto import runtime_pb2
														
 
															 from hivemind.utils.streaming import combine_from_streaming
														
 
															-BASE_COMPRESSION_TYPES: Dict[str, CompressionBase] = dict(
														
 
															+_BASE_COMPRESSION_TYPES: Dict[str, CompressionBase] = dict(
														
 
															     NONE=NoCompression(),
														
 
															     FLOAT16=Float16Compression(),
														
 
															     MEANSTD_16BIT=ScaledFloat16Compression(),
														
 
															     QUANTILE_8BIT=Quantile8BitQuantization(),
														
 
															     UNIFORM_8BIT=Uniform8BitQuantization(),
														
 
															+    BLOCKWISE_8BIT=BlockwiseQuantization(),
														
 
															 )
														
 
															 for key in runtime_pb2.CompressionType.keys():
														
 
															-    assert key in BASE_COMPRESSION_TYPES, f"Compression type {key} does not have a registered deserializer"
														
 
															-    actual_compression_type = BASE_COMPRESSION_TYPES[key].compression_type
														
 
															+    assert key in _BASE_COMPRESSION_TYPES, f"Compression type {key} does not have a registered deserializer"
														
 
															+    actual_compression_type = _BASE_COMPRESSION_TYPES[key].compression_type
														
 
															     assert (
														
 
															         runtime_pb2.CompressionType.Name(actual_compression_type) == key
														
 
															     ), f"Compression strategy for {key} has inconsistent type"
														
@@ -35,14 +36,14 @@ def serialize_torch_tensor(
 
															 ) -> runtime_pb2.Tensor:
														
 
															     """Serialize a given tensor into a protobuf message using the specified compression strategy"""
														
 
															     assert tensor.device == torch.device("cpu")
														
 
															-    compression = BASE_COMPRESSION_TYPES[runtime_pb2.CompressionType.Name(compression_type)]
														
 
															+    compression = _BASE_COMPRESSION_TYPES[runtime_pb2.CompressionType.Name(compression_type)]
														
 
															     info = info or CompressionInfo.from_tensor(tensor, **kwargs)
														
 
															     return compression.compress(tensor, info, allow_inplace)
														
 
															 def deserialize_torch_tensor(serialized_tensor: runtime_pb2.Tensor) -> torch.Tensor:
														
 
															     """Restore a pytorch tensor from a protobuf message"""
														
 
															-    compression = BASE_COMPRESSION_TYPES[runtime_pb2.CompressionType.Name(serialized_tensor.compression)]
														
 
															+    compression = _BASE_COMPRESSION_TYPES[runtime_pb2.CompressionType.Name(serialized_tensor.compression)]
														
 
															     return compression.extract(serialized_tensor).requires_grad_(serialized_tensor.requires_grad)
														
--- a/hivemind/proto/runtime.proto
+++ b/hivemind/proto/runtime.proto
@@ -26,6 +26,7 @@ enum CompressionType{
 
															   FLOAT16 = 2;
														
 
															   QUANTILE_8BIT = 3;
														
 
															   UNIFORM_8BIT = 4;
														
 
															+  BLOCKWISE_8BIT = 5;
														
 
															 }
														
 
															 message Tensor {
														
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,6 @@ EXECUTABLES = {
 
															     "p2pd": "1252a2a2095040cef8e317f5801df8b8c93559711783a2496a0aff2f3e177e39",
														
 
															 }
														
 
															-
														
 
															 here = os.path.abspath(os.path.dirname(__file__))
														
@@ -140,7 +139,9 @@ with open("requirements-dev.txt") as dev_requirements_file:
 
															 with open("requirements-docs.txt") as docs_requirements_file:
														
 
															     extras["docs"] = list(map(str, parse_requirements(docs_requirements_file)))
														
 
															-extras["all"] = extras["dev"] + extras["docs"]
														
 
															+extras["bitsandbytes"] = ["bitsandbytes==0.32.3"]
														
 
															+
														
 
															+extras["all"] = extras["dev"] + extras["docs"] + extras["bitsandbytes"]
														
 
															 setup(
														
 
															     name="hivemind",
														
--- a/tests/test_cli_scripts.py
+++ b/tests/test_cli_scripts.py
@@ -35,7 +35,7 @@ def test_dht_connection_successful():
 
															         dht_client_proc.stderr.readline()
														
 
															     first_report_msg = dht_client_proc.stderr.readline()
														
 
															-    assert "2 DHT nodes (including this one) are in the local routing table" in first_report_msg
														
 
															+    assert "2 DHT nodes (including this one) are in the local routing table" in first_report_msg, first_report_msg
														
 
															     # ensure we get the output of dht_proc after the start of dht_client_proc
														
 
															     sleep(dht_refresh_period)
														
--- a/tests/test_compression.py
+++ b/tests/test_compression.py
@@ -38,6 +38,8 @@ def test_tensor_compression(size=(128, 128, 64), alpha=5e-08, beta=0.0008):
 
															     assert error.square().mean() < beta
														
 
															     error = deserialize_torch_tensor(serialize_torch_tensor(X, CompressionType.UNIFORM_8BIT)) - X
														
 
															     assert error.square().mean() < beta
														
 
															+    error = deserialize_torch_tensor(serialize_torch_tensor(X, CompressionType.BLOCKWISE_8BIT)) - X
														
 
															+    assert error.square().mean() < beta
														
 
															     zeros = torch.zeros(5, 5)
														
 
															     for compression_type in CompressionType.values():
														
--- a/tests/test_moe.py
+++ b/tests/test_moe.py
@@ -162,7 +162,7 @@ def test_remote_module_call(hidden_dim=16):
 
															         # check that the server is still alive after processing a malformed request
														
 
															         out3_yet_again = real_expert(dummy_x[1:])
														
 
															-        assert torch.allclose(out3_yet_again, out3[1:])
														
 
															+        assert torch.allclose(out3_yet_again, out3[1:], atol=1e-5, rtol=0)
														
 
															 @pytest.mark.forked
														
--- a/tests/test_start_server.py
+++ b/tests/test_start_server.py
@@ -1,5 +1,6 @@
 
															 import os
														
 
															 import re
														
 
															+from functools import partial
														
 
															 from subprocess import PIPE, Popen
														
 
															 from tempfile import TemporaryDirectory
														
@@ -10,10 +11,11 @@ def test_background_server_identity_path():
 
															     with TemporaryDirectory() as tempdir:
														
 
															         id_path = os.path.join(tempdir, "id")
														
 
															-        with background_server(num_experts=1, identity_path=id_path) as server_info_1, background_server(
														
 
															-            num_experts=1, identity_path=id_path
														
 
															-        ) as server_info_2, background_server(num_experts=1, identity_path=None) as server_info_3:
														
 
															+        server_runner = partial(background_server, num_experts=1, device="cpu", hidden_dim=1)
														
 
															+        with server_runner(identity_path=id_path) as server_info_1, server_runner(
														
 
															+            identity_path=id_path
														
 
															+        ) as server_info_2, server_runner(identity_path=None) as server_info_3:
														
 
															             assert server_info_1.peer_id == server_info_2.peer_id
														
 
															             assert server_info_1.peer_id != server_info_3.peer_id
														
 
															             assert server_info_3.peer_id == server_info_3.peer_id
														
@@ -33,9 +35,11 @@ def test_cli_run_server_identity_path():
 
															         )
														
 
															         # Skip line "Generating new identity (libp2p private key) in {path to file}"
														
 
															+        server_1_proc.stderr.readline()
														
 
															         line = server_1_proc.stderr.readline()
														
 
															-        line = server_1_proc.stderr.readline()
														
 
															-        addrs_1 = set(re.search(pattern, line).group(1).split(", "))
														
 
															+        addrs_pattern_result = re.search(pattern, line)
														
 
															+        assert addrs_pattern_result is not None, line
														
 
															+        addrs_1 = set(addrs_pattern_result.group(1).split(", "))
														
 
															         ids_1 = set(a.split("/")[-1] for a in addrs_1)
														
 
															         assert len(ids_1) == 1
														
@@ -48,7 +52,9 @@ def test_cli_run_server_identity_path():
 
															         )
														
 
															         line = server_2_proc.stderr.readline()
														
 
															-        addrs_2 = set(re.search(pattern, line).group(1).split(", "))
														
 
															+        addrs_pattern_result = re.search(pattern, line)
														
 
															+        assert addrs_pattern_result is not None, line
														
 
															+        addrs_2 = set(addrs_pattern_result.group(1).split(", "))
														
 
															         ids_2 = set(a.split("/")[-1] for a in addrs_2)
														
 
															         assert len(ids_2) == 1
														
@@ -61,7 +67,9 @@ def test_cli_run_server_identity_path():
 
															         )
														
 
															         line = server_3_proc.stderr.readline()
														
 
															-        addrs_3 = set(re.search(pattern, line).group(1).split(", "))
														
 
															+        addrs_pattern_result = re.search(pattern, line)
														
 
															+        assert addrs_pattern_result is not None, line
														
 
															+        addrs_3 = set(addrs_pattern_result.group(1).split(", "))
														
 
															         ids_3 = set(a.split("/")[-1] for a in addrs_3)
														
 
															         assert len(ids_3) == 1