
Add automated tests (#23)

This PR sets up basic automated tests that run on each subsequent PR:

- convert a small model on every PR
- run existing tests on every PR
- enforce black / isort formatting (see the local commands below)
- require the checks to pass before merging
- make sure tests are not flaky
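For reference, the style checks can be reproduced locally before pushing (a sketch, assuming the pinned dev requirements are installed; black and isort read their settings from the pyproject.toml added in this PR):

    pip install -r requirements-dev.txt
    black --check --diff .    # same version (22.3.0) that the CI job pins
    isort --check-only .      # profile = "black", line_length = 120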

Co-authored-by: Alexander Borzunov <hxrussia@gmail.com>
Co-authored-by: Dmitry Baranchuk <dmitrybaranchuk@gmail.com>
justheuristic 3 years ago
parent
commit
e2711a033b

+ 26 - 0
.github/workflows/check-style.yaml

@@ -0,0 +1,26 @@
+name: Check style
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+
+jobs:
+  black:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: psf/black@stable
+        with:
+          options: "--check --diff"
+          version: "22.3.0"
+  isort:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      - uses: isort/isort-action@master
+        with:
+          isortVersion: "5.10.1"

+ 89 - 0
.github/workflows/run-tests.yaml

@@ -0,0 +1,89 @@
+name: Tests
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+
+jobs:
+  convert-model:
+    runs-on: ubuntu-latest
+    env:
+      BLOOM_TESTING_WRITE_TOKEN: ${{ secrets.BLOOM_TESTING_WRITE_TOKEN }}
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.9
+      - name: Cache dependencies
+        uses: actions/cache@v2
+        with:
+          path: ~/.cache/pip
+          key: Key-v1-py3.9-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: Delete previous model, if exists
+        run: |
+          python -c "from huggingface_hub import delete_repo; delete_repo(token='$BLOOM_TESTING_WRITE_TOKEN', \
+          name='test-bloomd-350m-$GITHUB_HEAD_REF', organization='bloom-testing')" || true
+      - name: Convert model and push to hub
+        run: |
+          python -m cli.convert_model --model bigscience/bloom-350m  --output_path ./converted_model \
+            --output_repo bloom-testing/test-bloomd-350m-$GITHUB_HEAD_REF --use_auth_token $BLOOM_TESTING_WRITE_TOKEN
+
+
+  run-tests:
+    runs-on: ubuntu-latest
+    needs: convert-model
+    strategy:
+      matrix:
+        python-version: [ 3.7, 3.8, 3.9 ]
+      fail-fast: false
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Cache dependencies
+        uses: actions/cache@v2
+        with:
+          path: ~/.cache/pip
+          key: Key-v1-${{ matrix.python-version }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -r requirements-dev.txt
+      - name: Test
+        run: |
+          export MODEL_NAME=bloom-testing/test-bloomd-350m-$GITHUB_HEAD_REF
+          python -m cli.run_server --converted_model_name_or_path $MODEL_NAME --block_indices 0:12 \
+            --torch_dtype float32 --identity tests/test.id --host_maddrs /ip4/127.0.0.1/tcp/31337 --throughput 1 &
+          SERVER1_PID=$!
+          
+          export INITIAL_PEERS=/ip4/127.0.0.1/tcp/31337/p2p/QmS9KwZptnVdB9FFV7uGgaTq4sEKBwcYeKZDfSpyKDUd1g
+          # ^-- server 1 multiaddr is determined by --identity and --host_maddrs
+          
+          python -m cli.run_server --converted_model_name_or_path $MODEL_NAME --block_indices 12:24 \
+            --torch_dtype float32 --initial_peers $INITIAL_PEERS --throughput 1 &> server2.log &
+          SERVER2_PID=$!
+
+          sleep 30  # wait for server to download layers
+          
+          # test individual blocks
+          export PYTHONPATH=.
+          BLOCK_UID=$MODEL_NAME.0 REF_NAME=$MODEL_NAME REF_INDEX=0 pytest tests/test_block_exact_match.py
+          BLOCK_UID=$MODEL_NAME.19 REF_NAME=$MODEL_NAME REF_INDEX=19 pytest tests/test_block_exact_match.py
+
+          REF_NAME=$MODEL_NAME pytest tests/test_chained_calls.py
+          
+          REF_NAME=bigscience/bloom-350m pytest tests/test_full_model.py
+          
+          kill -s SIGINT $SERVER1_PID $SERVER2_PID
+          echo "Done!"
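
For local debugging, a rough equivalent of the workflow's test job is sketched below. It is an approximation only: <your-org>/<your-test-repo> and <your-token> are placeholders, it serves all 24 blocks from a single server rather than two, and the sleep may need adjusting.

    pip install -r requirements.txt -r requirements-dev.txt
    python -m cli.convert_model --model bigscience/bloom-350m --output_path ./converted_model \
        --output_repo <your-org>/<your-test-repo> --use_auth_token <your-token>
    export MODEL_NAME=<your-org>/<your-test-repo>
    python -m cli.run_server --converted_model_name_or_path $MODEL_NAME --block_indices 0:24 \
        --torch_dtype float32 --identity tests/test.id --host_maddrs /ip4/127.0.0.1/tcp/31337 --throughput 1 &
    export INITIAL_PEERS=/ip4/127.0.0.1/tcp/31337/p2p/QmS9KwZptnVdB9FFV7uGgaTq4sEKBwcYeKZDfSpyKDUd1g
    sleep 30  # wait for the server to load its blocks, as the workflow does
    PYTHONPATH=. REF_NAME=$MODEL_NAME pytest tests/test_chained_calls.py
    PYTHONPATH=. REF_NAME=bigscience/bloom-350m pytest tests/test_full_model.py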

+ 2 - 1
cli/convert_model.py

@@ -10,8 +10,9 @@ from huggingface_hub import Repository
 from tqdm.auto import tqdm
 
 from src import BloomModel
+from src.bloom.from_pretrained import BLOCK_BRANCH_PREFIX, CLIENT_BRANCH
 from src.client import DistributedBloomConfig
-from src.bloom.from_pretrained import CLIENT_BRANCH, BLOCK_BRANCH_PREFIX
+
 use_hivemind_log_handler("in_root_logger")
 logger = get_logger(__file__)
 

The diff for this file is too large to display
+ 242 - 285
cli/speed_test.py


+ 10 - 0
pyproject.toml

@@ -0,0 +1,10 @@
+[tool.black]
+line-length = 120
+required-version = "22.3.0"
+
+[tool.isort]
+profile = "black"
+line_length = 120
+combine_as_imports = true
+combine_star = true
+known_local_folder = ["tests", "cli"]

+ 6 - 0
requirements-dev.txt

@@ -0,0 +1,6 @@
+pytest==6.2.5  # see https://github.com/pytest-dev/pytest/issues/9621
+pytest-forked
+pytest-asyncio==0.16.0
+black==22.3.0
+isort==5.10.1
+psutil

+ 6 - 0
requirements.txt

@@ -0,0 +1,6 @@
+torch==1.12.0
+accelerate==0.10.0
+huggingface-hub==0.7.0
+bitsandbytes-cuda113==0.26.0
+https://github.com/learning-at-home/hivemind/archive/d42c70331da43667da6d9020666df54806d8b561.zip
+https://github.com/huggingface/transformers/archive/6589e510fa4e6c442059de2fab84752535de9b23.zip

+ 9 - 2
src/bloom/block.py

@@ -9,8 +9,15 @@ import torch
 import torch.nn as nn
 import torch.nn.quantized.dynamic.modules.linear
 
-from src.bloom.ops import (BloomGelu, BloomScaledSoftmax, attention_mask_func, build_alibi_tensor, dropout_add,
-                           pre_process_alibi_for_pad, split_tensor_along_last_dim)
+from src.bloom.ops import (
+    BloomGelu,
+    BloomScaledSoftmax,
+    attention_mask_func,
+    build_alibi_tensor,
+    dropout_add,
+    pre_process_alibi_for_pad,
+    split_tensor_along_last_dim,
+)
 
 
 class BloomAttention(nn.Module):

+ 30 - 13
src/bloom/model.py

@@ -10,14 +10,16 @@ import torch.nn.functional as F
 import torch.utils.checkpoint
 from hivemind import use_hivemind_log_handler
 from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss, LayerNorm
-from transformers.file_utils import (add_code_sample_docstrings, add_start_docstrings,
-                                     add_start_docstrings_to_model_forward)
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
+from transformers.file_utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+)
 from transformers.modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
     SequenceClassifierOutputWithPast,
-    TokenClassifierOutput,
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bloom.configuration_bloom import BloomConfig
@@ -445,12 +447,27 @@ class LMHead(nn.Module):
         self.word_embeddings = word_embeddings
         self.chunk_size = config.chunk_size_for_efficient_fp16_on_cpu
 
+    @property
+    def in_features(self) -> int:
+        return self.word_embeddings.num_embeddings
+
+    @property
+    def out_features(self) -> int:
+        return self.word_embeddings.embedding_dim
+
+    @property
+    def weight(self):
+        return self.word_embeddings.weight
+
+    @property
+    def bias(self):
+        return None
+
     def forward(self, hidden_states):
         word_embeddings = self.word_embeddings.weight
-        
+
         # We use 'chunked_forward' only when embeddings are in half-precision on CPU.
-        if word_embeddings.dtype in [torch.float16, torch.bfloat16] and \
-            word_embeddings.device.type == 'cpu':
+        if word_embeddings.dtype in [torch.float16, torch.bfloat16] and word_embeddings.device.type == "cpu":
             lm_logits = self.chunked_forward(hidden_states)
         else:
             # Switch dtype in case word_embeddings are fp16/bf16
@@ -459,20 +476,20 @@ class LMHead(nn.Module):
         return lm_logits
 
     def chunked_forward(self, hidden_states):
-        """ Splits word embeddings on chunks and iteratively casts them into fp32 to perform matmul more efficiently on CPU. 
-            chunk_size: provides trade-off between efficiency and extra memory consumption. 
+        """Splits word embeddings on chunks and iteratively casts them into fp32 to perform matmul more efficiently on CPU.
+        chunk_size: provides trade-off between efficiency and extra memory consumption.
         """
         assert self.chunk_size > 0, "Chunk size for chunked forward must be positive"
 
         word_embeddings = self.word_embeddings.weight
         num_embeddings = self.word_embeddings.num_embeddings
 
-        hidden_states = hidden_states.float()    
+        hidden_states = hidden_states.float()
         output = torch.zeros(*hidden_states.shape[:-1], num_embeddings)
 
         for i in range(0, num_embeddings, self.chunk_size):
-            chunk = word_embeddings[i: i + self.chunk_size].float()
-            output[..., i: i + self.chunk_size] = F.linear(hidden_states, chunk)
+            chunk = word_embeddings[i : i + self.chunk_size].float()
+            output[..., i : i + self.chunk_size] = F.linear(hidden_states, chunk)
         return output
 
 
@@ -565,7 +582,7 @@ class BloomForSequenceClassification(BloomPreTrainedModel):
                     f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                     "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                 )
-            
+
         pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
 
         loss = None

+ 1 - 1
src/client/__init__.py

@@ -1,4 +1,4 @@
 from src.client.remote_block import RemoteTransformerBlock, RemoteTransformerBlockInferenceSession
 from src.client.remote_model import DistributedBloomConfig, DistributedBloomForCausalLM, DistributedBloomModel
-from src.client.remote_sequence_info import RemoteSequenceInfo
 from src.client.remote_sequential import RemoteSequential
+from src.client.sequence_manager import RemoteSequenceManager

+ 51 - 22
src/client/remote_model.py

@@ -2,15 +2,20 @@
 import os
 from typing import Optional, Tuple
 
+import hivemind
 import torch
 import torch.nn as nn
-
-import hivemind
 from hivemind import get_logger, use_hivemind_log_handler
 
-from src.bloom.model import BloomConfig, BloomForCausalLM, BloomModel, BloomPreTrainedModel, LMHead, BloomForSequenceClassification
+from src.bloom.model import (
+    BloomConfig,
+    BloomForCausalLM,
+    BloomForSequenceClassification,
+    BloomModel,
+    BloomPreTrainedModel,
+    LMHead,
+)
 from src.client.remote_sequential import RemoteSequential
-from src.data_structures import UID_DELIMITER
 
 use_hivemind_log_handler("in_root_logger")
 logger = get_logger(__file__)
@@ -25,12 +30,13 @@ class DistributedBloomConfig(BloomConfig):
     initial_peers: Tuple[str, ...] = ()  # a list of initial peers for hivemind DHT
     dht_prefix: str  # a prefix for all dht keys that correspond to this model (usually equal to model name)
     dht: Optional[hivemind.DHT] = None  # a running DHT instance, e.g. when using the same DHT for multiple models
-    chunk_size_for_efficient_fp16_on_cpu: int = 10000 # a chunk size for a LM head for efficient half-precision on CPU
-    num_prefix_tokens: int = 0 # a number of tokens for prompt tuning. 
+    chunk_size_for_efficient_fp16_on_cpu: int = 10000  # a chunk size for a LM head for efficient half-precision on CPU
+    num_prefix_tokens: int = 0  # a number of tokens for prompt tuning.
 
 
 class DistributedBloomModel(BloomModel):
     """BloomModel, but all transformer layers are hosted by the swarm"""
+
     config_class = DistributedBloomConfig
 
     def __init__(self, config: DistributedBloomConfig):
@@ -49,7 +55,7 @@ class DistributedBloomModel(BloomModel):
         )
         assert isinstance(dht, hivemind.DHT) and dht.is_alive(), "dht must be a running hivemind.DHT instance"
         self.h = RemoteSequential(config, dht, config.dht_prefix)
-    
+
         # Forbid accumulate grads for embeddings and layernorm
         self.set_requires_grad(False)
 
@@ -57,6 +63,14 @@ class DistributedBloomModel(BloomModel):
         for p in self.parameters():
             p.requires_grad = value
 
+    def forward(self, *args, use_cache=None, **kwargs):
+        if use_cache:
+            raise ValueError(
+                "Distributed forward does not support use_cache; for efficient cache-aware generation, "
+                "please use model.transformer.inference_session() or model.generate(...)"
+            )
+        return super().forward(*args, use_cache=False, **kwargs)
+
 
 class DistributedBloomPrefix(DistributedBloomModel):
     """DistributedBloomModel with prefix tokens for prompt tuning"""
@@ -76,7 +90,7 @@ class DistributedBloomPrefix(DistributedBloomModel):
         return prompts
 
     def forward(
-        self, 
+        self,
         input_ids: Optional[torch.LongTensor],
         inputs_embeds: Optional[torch.Tensor],
         attention_mask: Optional[torch.Tensor],
@@ -86,14 +100,16 @@ class DistributedBloomPrefix(DistributedBloomModel):
         use_cache=None,
         output_attentions=None,
         output_hidden_states=None,
-        return_dict=None
+        return_dict=None,
     ):
-        assert input_ids is None or inputs_embeds is None, "You cannot specify both input_ids and inputs_embeds at the same time"
+        assert (
+            input_ids is None or inputs_embeds is None
+        ), "You cannot specify both input_ids and inputs_embeds at the same time"
         assert input_ids is not None or inputs_embeds is not None, "You must specify either input_ids or inputs_embeds"
-        
+
         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
-    
+
         batch_size = inputs_embeds.shape[0]
 
         if attention_mask is not None:
@@ -104,25 +120,26 @@ class DistributedBloomPrefix(DistributedBloomModel):
         inputs_embeds = torch.cat([prompts, inputs_embeds], dim=1)
 
         transformer_outputs = super().forward(
-            inputs_embeds=inputs_embeds, 
-            attention_mask=attention_mask, 
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
             past_key_values=past_key_values,
             position_ids=position_ids,
             head_mask=head_mask,
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict
+            return_dict=return_dict,
         )
 
         # Remove prefix
-        last_hidden_state = transformer_outputs[0][:, self.prefix_length:]
-        transformer_outputs['last_hidden_state'] = last_hidden_state
+        last_hidden_state = transformer_outputs[0][:, self.prefix_length :]
+        transformer_outputs["last_hidden_state"] = last_hidden_state
         return transformer_outputs
 
 
 class DistributedBloomForCausalLM(BloomForCausalLM):
-    """DistributedBloomForCausalLM, but all transformer layers are hosted by the swarm"""
+    """Similar to BloomForCausalLM, but all transformer layers are hosted by the swarm"""
+
     config_class = DistributedBloomConfig
 
     def __init__(self, config: DistributedBloomConfig):
@@ -136,11 +153,23 @@ class DistributedBloomForCausalLM(BloomForCausalLM):
         # Initialize weights and apply final processing
         self.post_init()
 
-    def get_output_embeddings(self):
-        return self.lm_head.word_embeddings
+    def get_input_embeddings(self):
+        return self.transformer.word_embeddings
 
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head.word_embeddings.weight = new_embeddings.weight
+    def get_output_embeddings(self):
+        if self.config.tie_word_embeddings:
+            return None
+        return self.lm_head
+
+    def set_input_embeddings(self, new_embeddings: nn.Embedding):
+        assert isinstance(new_embeddings, nn.Embedding)
+        self.transformer.word_embeddings = self.lm_head.word_embeddings = new_embeddings
+        assert self.lm_head.bias is None or len(self.lm_head.bias) == new_embeddings.num_embeddings
+
+    def set_output_embeddings(self, new_lm_head: nn.Linear):
+        with torch.no_grad():
+            self.lm_head.word_embeddings.weight[...] = new_lm_head.weight
+            self.lm_head.bias[...] = new_lm_head.bias
 
 
 class DistributedBloomForSequenceClassification(BloomForSequenceClassification):

+ 41 - 16
src/client/remote_sequential.py

@@ -3,6 +3,7 @@ from __future__ import annotations
 import contextlib
 import logging
 import random
+from typing import Optional, Union
 
 import torch
 from hivemind import DHT, P2P, get_logger, use_hivemind_log_handler
@@ -12,7 +13,7 @@ from torch import nn
 
 import src
 from src.client.remote_block import RemoteTransformerBlock
-from src.client.remote_sequence_info import RemoteSequenceInfo
+from src.client.sequence_manager import RemoteSequenceManager
 from src.data_structures import UID_DELIMITER
 from src.dht_utils import _create_remote_modules_from_infos
 
@@ -25,7 +26,15 @@ class RemoteSequential(nn.Module):
     A sequence of transformer blocks hosted by the swarm.
     """
 
-    def __init__(self, config: src.DistributedBloomConfig, dht: DHT, prefix: str, max_retries: int = 3):
+    def __init__(
+        self,
+        config: src.DistributedBloomConfig,
+        dht: DHT,
+        prefix: str,
+        max_retries: int = 3,
+        p2p: Optional[P2P] = None,
+        sequence_manager: Optional[RemoteSequenceManager] = None,
+    ):
         logger.warning(f"{self.__class__.__name__} is in active development; expect adventures")
         if prefix.endswith(UID_DELIMITER):
             logger.warning(
@@ -39,12 +48,17 @@ class RemoteSequential(nn.Module):
         self.dht = dht
         self.prefix = prefix
         self.max_retries = max_retries
-        self.p2p = RemoteExpertWorker.run_coroutine(dht.replicate_p2p())
-
-        block_uids = tuple(f"{prefix}{UID_DELIMITER}{i}" for i in range(config.n_layer))
-
-        logger.debug(f"Remote block uids: {block_uids}")
-        self.remote_sequence_info = RemoteSequenceInfo(dht, block_uids)
+        self.p2p = RemoteExpertWorker.run_coroutine(dht.replicate_p2p()) if p2p is None else p2p
+
+        block_uids = [f"{prefix}{UID_DELIMITER}{i}" for i in range(config.n_layer)]
+        if sequence_manager is None:
+            logger.debug(f"Creating new sequence manager for block uids: {block_uids}")
+            self.sequence_manager = RemoteSequenceManager(dht, block_uids)
+            self.is_subsequence = False
+        else:
+            assert isinstance(sequence_manager.block_uids, list)
+            logger.debug(f"Reusing sequence manager with {len(self.sequence_manager)}")
+            self.is_subsequence = self.sequence_manager.block_uids == block_uids
 
     def forward(self, inputs: torch.Tensor):
         assert isinstance(inputs, torch.Tensor) and inputs.ndim == 3 and inputs.shape[-1] == self.config.n_embed
@@ -64,27 +78,38 @@ class RemoteSequential(nn.Module):
                         logging.debug(f"Caught {e} when running forward for block {block_index}", exc_info=True)
         return inputs
 
-    def __getitem__(self, block_index: int):
-        assert 0 <= block_index < self.config.n_layer
-        (module,) = _create_remote_modules_from_infos([self.remote_sequence_info.block_infos[block_index]], self.p2p)
-        return module
+    def __getitem__(self, ix: Union[int, slice]) -> Union[RemoteTransformerBlock, RemoteSequential]:
+        assert isinstance(ix, (int, slice))
+        if isinstance(ix, int):
+            assert 0 <= ix < self.config.n_layer
+            (module,) = _create_remote_modules_from_infos([self.sequence_manager.block_infos[ix]], self.p2p)
+            return module
+        else:
+            return RemoteSequential(
+                self.config,
+                self.dht,
+                prefix=self.prefix,
+                max_retries=self.max_retries,
+                p2p=self.p2p,
+                sequence_manager=self.sequence_manager[ix],
+            )
 
     def __iter__(self):
         for block_index in range(self.config.n_layer):
             yield self[block_index]
 
     def __len__(self):
-        return len(self.remote_sequence_info)
+        return len(self.sequence_manager)
 
     def inference_session(self) -> RemoteSequentialInferenceSession:
-        self.remote_sequence_info.update_()
-        return RemoteSequentialInferenceSession(self.remote_sequence_info, self.p2p)
+        self.sequence_manager.update_()
+        return RemoteSequentialInferenceSession(self.sequence_manager, self.p2p)
 
 
 class RemoteSequentialInferenceSession:
     """An interface to a multi-step *inference* session for a sequence of remote transformer blocks"""
 
-    def __init__(self, remote_sequence_info: RemoteSequenceInfo, p2p: P2P):
+    def __init__(self, remote_sequence_info: RemoteSequenceManager, p2p: P2P):
         self.remote_sequence_info = remote_sequence_info
         self.p2p = p2p
         self.closed = False

+ 25 - 14
src/client/remote_sequence_info.py → src/client/sequence_manager.py

@@ -1,29 +1,27 @@
 from __future__ import annotations
 
 import threading
-from typing import List, NamedTuple, Optional, Sequence, Tuple
+from typing import List, Optional, Sequence, Tuple, Union
 
-from hivemind import DHT, PeerID
+from hivemind import DHT, DHTExpiration
 from hivemind.utils.logging import get_logger, use_hivemind_log_handler
 
-from src.data_structures import ModuleUID, RemoteModuleInfo, ServerState
+from src.data_structures import ModuleUID, RemoteModuleInfo, RemoteSpanInfo, ServerState
 from src.dht_utils import get_remote_module_infos
 
 use_hivemind_log_handler("in_root_logger")
 logger = get_logger(__file__)
 
 
-Span = NamedTuple("Span", [("start", int), ("end", Optional[int]), ("peer_id", PeerID)])
-
-
-class RemoteSequenceInfo:
+class RemoteSequenceManager:
     """Keeps and updates the meta-information about which peers host which blocks"""
 
     dht: DHT
     block_uids: List[ModuleUID]
     block_infos: List[Optional[RemoteModuleInfo]]
-    spans_by_priority: List[Span]  # sorted from best to worst
-    spans_containing_block: Tuple[List[Span]]
+    spans_by_priority: List[RemoteSpanInfo]  # sorted from best to worst
+    spans_containing_block: Tuple[List[RemoteSpanInfo], ...]
+    last_update_time: DHTExpiration
     lock_changes: threading.Lock
 
     def __init__(self, dht: DHT, block_uids: Sequence[ModuleUID]):
@@ -32,6 +30,7 @@ class RemoteSequenceInfo:
         self.block_infos = [None] * len(self.block_uids)
         self.spans_by_priority = []
         self.spans_containing_block = tuple(list() for _ in range(len(self.block_uids)))
+        self.last_update_time = -float("inf")
         self.lock_changes = threading.Lock()
         self.update_()
 
@@ -39,6 +38,18 @@ class RemoteSequenceInfo:
             assert info is not None, f"Found no remote peers for block {uid}"
         assert self.spans_by_priority and self.spans_containing_block
 
+    def __getitem__(self, ix: Union[int, slice]) -> RemoteSequenceManager:
+        """Get a RemoteSequenceManager for a sub-sequence of blocks"""
+        assert isinstance(ix, (int, slice))
+        if not isinstance(ix, slice):
+            ix = slice(int(ix), int(ix) + 1, 1)
+        with self.lock_changes:
+            subseq = RemoteSequenceManager(self.dht, self.block_uids[ix])
+            subseq.block_infos = self.block_infos[ix]
+            subseq.spans_by_priority, subseq.spans_containing_block = subseq.compute_spans(subseq.block_infos)
+            subseq.last_update_time = self.last_update_time
+        return subseq
+
     def update_(self):
         with self.lock_changes:
             self.update_block_infos_()
@@ -67,15 +78,15 @@ class RemoteSequenceInfo:
                 if server.state != ServerState.ONLINE:
                     continue
                 if peer_id not in active_spans:
-                    active_spans[peer_id] = Span(start=block_index, end=block_index + 1, peer_id=peer_id)
+                    active_spans[peer_id] = RemoteSpanInfo(start=block_index, end=block_index + 1, peer_id=peer_id)
                 else:  # peer_id in active_spans
-                    active_spans[peer_id] = active_spans[peer_id]._replace(end=block_index + 1)
+                    active_spans[peer_id].end = block_index + 1
 
             for peer_id in list(active_spans.keys()):
                 if (
-                    peer_id not in info.servers or
-                    info.servers[peer_id].state != ServerState.ONLINE or
-                    block_index == len(block_infos) - 1
+                    peer_id not in info.servers
+                    or info.servers[peer_id].state != ServerState.ONLINE
+                    or block_index == len(block_infos) - 1
                 ):
                     closed_spans.append(active_spans.pop(peer_id))
         assert not active_spans

+ 11 - 0
src/data_structures.py

@@ -23,5 +23,16 @@ class ServerInfo:
 
 @dataclass
 class RemoteModuleInfo:
+    """A remote module that is served by one or more servers"""
+
     uid: ModuleUID
     servers: Dict[PeerID, ServerInfo]
+
+
+@dataclass
+class RemoteSpanInfo:
+    """A chain of remote blocks served by one specific remote peer"""
+
+    start: int
+    end: int
+    peer_id: PeerID

+ 6 - 2
src/dht_utils.py

@@ -136,8 +136,12 @@ async def _get_remote_module_infos(
             try:
                 peer_id = PeerID.from_base58(peer_id)
                 state, throughput = server_info.value
-                if not (isinstance(state, int) and isinstance(throughput, float) and
-                        math.isfinite(throughput) and throughput >= 0.0):
+                if not (
+                    isinstance(state, int)
+                    and isinstance(throughput, float)
+                    and math.isfinite(throughput)
+                    and throughput >= 0.0
+                ):
                     raise ValueError(f"Invalid server info: {server_info}")
                 servers[peer_id] = ServerInfo(ServerState(state), throughput)
             except (TypeError, ValueError) as e:

+ 4 - 4
src/server/block_selection.py

@@ -9,10 +9,10 @@ def choose_best_blocks(num_blocks: int, remote_module_infos: List[Optional[Remot
         if module is None:
             throughputs.append(0)
             continue
-        throughputs.append(sum(server.throughput for server in module.servers.values()
-                               if server.state != ServerState.OFFLINE))
+        throughputs.append(
+            sum(server.throughput for server in module.servers.values() if server.state != ServerState.OFFLINE)
+        )
 
-    options = [(sorted(throughputs[i:i + num_blocks]), i)
-               for i in range(0, len(throughputs) - num_blocks + 1)]
+    options = [(sorted(throughputs[i : i + num_blocks]), i) for i in range(0, len(throughputs) - num_blocks + 1)]
     best_start = min(options)[1]
     return list(range(best_start, best_start + num_blocks))

+ 8 - 10
src/server/server.py

@@ -4,7 +4,7 @@ import multiprocessing as mp
 import random
 import threading
 import time
-from typing import Dict, Literal, Optional, Sequence, Union
+from typing import Dict, Optional, Sequence, Union
 
 import torch
 from hivemind import DHT, MAX_DHT_TIME_DISCREPANCY_SECONDS, BatchTensorDescriptor, get_dht_time
@@ -13,7 +13,7 @@ from hivemind.moe.server.runtime import Runtime
 from hivemind.proto.runtime_pb2 import CompressionType
 from hivemind.utils.logging import get_logger, use_hivemind_log_handler
 
-from src import declare_active_modules, BloomConfig
+from src import BloomConfig, declare_active_modules
 from src.bloom.from_pretrained import DTYPE_MAP, load_pretrained_block
 from src.data_structures import CHAIN_DELIMITER, UID_DELIMITER, ServerState
 from src.dht_utils import get_remote_module_infos
@@ -98,7 +98,7 @@ class Server(threading.Thread):
         cls,
         prefix: Optional[str],
         converted_model_name_or_path: str,
-        throughput: Union[float, Literal['auto', 'eval']],
+        throughput: Union[float, str],
         num_blocks: Optional[int] = None,
         block_indices: Optional[str] = None,
         num_handlers: Optional[int] = None,
@@ -140,17 +140,15 @@ class Server(threading.Thread):
         device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         memory_cache = MemoryCache(device, cache_size_bytes)
 
-        assert isinstance(throughput, float) or throughput in ['auto', 'eval']
-        if throughput in ['auto', 'eval']:
-            throughput = get_host_throughput(device, force_eval=(throughput == 'eval'))
+        assert isinstance(throughput, float) or throughput in ["auto", "eval"]
+        if throughput in ["auto", "eval"]:
+            throughput = get_host_throughput(device, force_eval=(throughput == "eval"))
 
         if isinstance(torch_dtype, str):
             torch_dtype = DTYPE_MAP[torch_dtype]
         assert torch_dtype in DTYPE_MAP.values(), f"torch_dtype must be one of {list(DTYPE_MAP.values())}"
 
-        block_config = BloomConfig.from_pretrained(
-            converted_model_name_or_path, use_auth_token=use_auth_token
-        )
+        block_config = BloomConfig.from_pretrained(converted_model_name_or_path, use_auth_token=use_auth_token)
 
         if block_indices is not None:
             try:
@@ -288,7 +286,7 @@ class ModuleAnnouncerThread(threading.Thread):
         throughput: float,
         update_period: float = 30,
         expiration: float,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.module_backends = module_backends

+ 14 - 13
src/server/throughput.py

@@ -20,10 +20,10 @@ use_hivemind_log_handler("in_root_logger")
 logger = get_logger(__file__)
 
 
-DEFAULT_CACHE_PATH = Path(Path.home(), '.cache', project_name, 'throughput.json')
-DEFAULT_LOCK_PATH = Path(tempfile.gettempdir(), project_name, 'throughput.lock')
+DEFAULT_CACHE_PATH = Path(Path.home(), ".cache", project_name, "throughput.json")
+DEFAULT_LOCK_PATH = Path(tempfile.gettempdir(), project_name, "throughput.lock")
 
-SPEED_TEST_PATH = Path(Path(__file__).absolute().parents[2], 'cli', 'speed_test.py')
+SPEED_TEST_PATH = Path(Path(__file__).absolute().parents[2], "cli", "speed_test.py")
 
 
 @dataclass
@@ -43,7 +43,7 @@ def get_host_throughput(
 
     # We use the system-wide lock since only one process at a time can measure the host throughput
     os.makedirs(lock_path.parent, exist_ok=True)
-    with open(lock_path, 'wb') as lock_fd:
+    with open(lock_path, "wb") as lock_fd:
         logger.info("Loading throughput info")
         fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX)
         # The OS will release the lock when lock_fd is closed or the process is killed
@@ -63,7 +63,7 @@ def get_host_throughput(
             info = measure_throughput_info()
             try:
                 os.makedirs(cache_path.parent, exist_ok=True)
-                with open(cache_path, 'w') as cache_fd:
+                with open(cache_path, "w") as cache_fd:
                     json.dump(asdict(info), cache_fd)
             except Exception:
                 logger.exception(f"Failed to save throughput info in {cache_path}")
@@ -73,29 +73,30 @@
 
 
 def measure_throughput_info() -> ThroughputInfo:
-    logger.info("Measuring network, CPU, and GPU throughput. "
-                "This takes about a minute and will be cached for future runs")
+    logger.info(
+        "Measuring network, CPU, and GPU throughput. " "This takes about a minute and will be cached for future runs"
+    )
 
     # We measure throughput in "(inference) requests per second" (RPS) using a fixed model
-    config = BloomConfig.from_pretrained('bigscience/test-bloomd-6b3')
+    config = BloomConfig.from_pretrained("bigscience/test-bloomd-6b3")
 
     network_rps = measure_network_rps(config)
 
-    device_rps = {'cpu': measure_device_rps('cpu', config)}
+    device_rps = {"cpu": measure_device_rps("cpu", config)}
     if torch.cuda.is_available():
-        device_rps['cuda'] = measure_device_rps('cuda', config)
+        device_rps["cuda"] = measure_device_rps("cuda", config)
 
     return ThroughputInfo(network_rps=network_rps, device_rps=device_rps)
 
 
 def measure_network_rps(config: BloomConfig) -> float:
-    proc = subprocess.run([SPEED_TEST_PATH, '--json'], capture_output=True)
+    proc = subprocess.run([SPEED_TEST_PATH, "--json"], capture_output=True)
     if proc.returncode != 0:
         raise RuntimeError(f"Failed to measure network throughput (stdout: {proc.stdout}, stderr: {proc.stderr})")
     network_info = json.loads(proc.stdout)
 
     bits_per_request = config.hidden_size * 32
-    network_rps = min(network_info['download'], network_info['upload']) / bits_per_request
+    network_rps = min(network_info["download"], network_info["upload"]) / bits_per_request
 
     logger.info(
         f"Network throughput: "
@@ -120,7 +121,7 @@ def measure_device_rps(device: str, config: BloomConfig, layer_index: int = 0, n
             elapsed += time.perf_counter() - start_time
         device_rps = n_steps / elapsed
 
-    device_name = f"{torch.cuda.get_device_name(0)} GPU" if device == 'cuda' else 'CPU'
+    device_name = f"{torch.cuda.get_device_name(0)} GPU" if device == "cuda" else "CPU"
     logger.info(f"Compute throughput ({device_name}): {device_rps:.2f} RPS")
 
     return device_rps

BIN
tests/test.id


+ 5 - 2
tests/test_block_exact_match.py

@@ -3,6 +3,7 @@ import os
 
 import hivemind
 import torch
+import transformers
 
 from src.bloom.from_pretrained import load_pretrained_block
 from src.client.remote_block import RemoteTransformerBlock
@@ -19,16 +20,18 @@ if not BLOCK_UID:
     raise RuntimeError("Must specify BLOCK_UID as an index of a transformer block to be tested")
 
 REF_NAME = os.environ.get("REF_NAME", "bigscience/test-bloomd-6b3")
-REF_INDEX = int(os.environ.get("REF_INDEX", BLOCK_UID[-1].split(".")[-1]))
+REF_INDEX = int(os.environ.get("REF_INDEX", BLOCK_UID.split(".")[-1]))
 
 
 def test_remote_block_exact_match(atol_forward=1e-5, atol_inference=1e-3):
     dht = hivemind.DHT(initial_peers=INITIAL_PEERS, client_mode=True, start=True)
+
     remote_block = get_remote_module(dht, BLOCK_UID)
     assert remote_block is not None, f"Could not find {BLOCK_UID} in DHT"
     assert isinstance(remote_block, RemoteTransformerBlock)
+    ref_config = transformers.AutoConfig.from_pretrained(REF_NAME)
 
-    inputs = torch.randn(1, 8, 4096)
+    inputs = torch.randn(1, 8, ref_config.hidden_size)
     (outputs_forward,) = remote_block(inputs)
 
     outputs_inference = []

+ 97 - 0
tests/test_chained_calls.py

@@ -0,0 +1,97 @@
+######
+# Warning: this test is a work in progress. It will be modified soon.
+# - if you want more stable tests, see test_block_exact_match
+# - if you want to figure out chained inference, ask yozh
+
+import os
+
+import hivemind
+import torch
+import transformers
+from hivemind.moe.expert_uid import UID_DELIMITER, ExpertInfo
+
+from src.bloom.from_pretrained import load_pretrained_block
+from src.client.remote_block import RemoteTransformerBlock
+from src.dht_utils import get_remote_module
+
+INITIAL_PEERS = os.environ.get("INITIAL_PEERS")
+if not INITIAL_PEERS:
+    raise RuntimeError("Must specify INITIAL_PEERS environment variable with one or more peer ids")
+INITIAL_PEERS = INITIAL_PEERS.split()
+
+
+MODEL_NAME = os.environ.get("MODEL_NAME")
+if not MODEL_NAME:
+    raise RuntimeError("Must specify MODEL_NAME as a name of a model to be tested")
+
+REF_NAME = os.environ.get("REF_NAME", "bigscience/test-bloomd-6b3")
+
+
+def test_forward_backward_exact_match(atol_forward=1e-4, atol_backward=1e-4, seq_length=1):
+    dht = hivemind.DHT(initial_peers=INITIAL_PEERS, client_mode=True, start=True)
+    config = transformers.AutoConfig.from_pretrained(MODEL_NAME)
+    remote_block = get_remote_module(dht, f"{MODEL_NAME}{UID_DELIMITER}0")
+    assert remote_block is not None, f"Could not find {MODEL_NAME}{UID_DELIMITER}0 in DHT"
+    assert isinstance(remote_block, RemoteTransformerBlock)
+
+    _ = remote_block.info  # lazy-init info now, because otherwise we will _break_ info init by changing _info
+    remote_block._info = ExpertInfo(f"{MODEL_NAME}.3 {MODEL_NAME}.4 {MODEL_NAME}.5", remote_block._info.peer_id)
+
+    ref_blocks = [
+        load_pretrained_block(REF_NAME, 3, torch_dtype=torch.float32),
+        load_pretrained_block(REF_NAME, 4, torch_dtype=torch.float32),
+        load_pretrained_block(REF_NAME, 5, torch_dtype=torch.float32),
+    ]
+    inputs = torch.randn(1, seq_length, config.hidden_size, requires_grad=True)
+    outputs_rpc = remote_block.forward(inputs)[0]
+    outputs_rpc.sum().backward()
+    grads_rpc = inputs.grad
+
+    inputs.grad = None
+    hidden_states = inputs
+    for ref_block in ref_blocks:
+        hidden_states = ref_block.forward(hidden_states)[0]
+    outputs_ref = hidden_states
+    outputs_ref.sum().backward()
+    grads_ref = inputs.grad
+
+    assert torch.allclose(outputs_ref, outputs_rpc, rtol=0, atol=atol_forward)
+    assert torch.allclose(grads_ref, grads_rpc, rtol=0, atol=atol_backward)
+
+
+def test_chained_inference_exact_match(atol_inference=1e-4):
+    dht = hivemind.DHT(initial_peers=INITIAL_PEERS, client_mode=True, start=True)
+    config = transformers.AutoConfig.from_pretrained(MODEL_NAME)
+    remote_block = get_remote_module(dht, f"{MODEL_NAME}{UID_DELIMITER}0")
+    assert remote_block is not None, f"Could not find {MODEL_NAME}{UID_DELIMITER}0 in DHT"
+    assert isinstance(remote_block, RemoteTransformerBlock)
+
+    _ = remote_block.info  # lazy-init info now, because otherwise we will _break_ info init by changing _info
+    remote_block._info = ExpertInfo(f"{MODEL_NAME}.3 {MODEL_NAME}.4", remote_block._info.peer_id)
+
+    inputs = torch.randn(1, 8, config.hidden_size)
+
+    outputs_inference = []
+    with remote_block.inference_session() as sess:
+        for i in range(inputs.shape[1]):
+            outputs_inference.append(sess.step(inputs[:, i : i + 1, :]))
+    outputs_inference = torch.cat(outputs_inference, dim=1)
+
+    ref_blocks = [
+        load_pretrained_block(REF_NAME, 3, torch_dtype=torch.float32),
+        load_pretrained_block(REF_NAME, 4, torch_dtype=torch.float32),
+    ]
+    outputs_ref = []
+    caches = [None, None]
+    for i in range(inputs.shape[1]):
+        new_caches = []
+        hidden_states = inputs[:, i : i + 1, :]
+        for ref_block, cache in zip(ref_blocks, caches):
+            with torch.no_grad():
+                hidden_states, new_cache = ref_block.forward(hidden_states, use_cache=True, layer_past=cache)
+                new_caches.append(new_cache)
+
+        outputs_ref.append(hidden_states)
+        caches = new_caches
+    outputs_ref = torch.cat(outputs_ref, dim=1)
+    assert torch.allclose(outputs_ref, outputs_inference, rtol=0, atol=atol_inference)

+ 0 - 59
tests/test_chained_forward_backward.py

@@ -1,59 +0,0 @@
-######
-# Warning:torch this test is a work in progress. It will be modified soon.
-# - if you want more stable tests, see test_block_exact_match
-# - if you want to figure out chained inference, ask yozh
-
-import os
-
-import hivemind
-import torch
-from hivemind.moe.expert_uid import ExpertInfo
-
-from src.bloom.from_pretrained import load_pretrained_block
-from src.client.remote_block import RemoteTransformerBlock
-from src.dht_utils import get_remote_module
-
-INITIAL_PEERS = os.environ.get("INITIAL_PEERS")
-if not INITIAL_PEERS:
-    raise RuntimeError("Must specify INITIAL_PEERS environment variable with one or more peer ids")
-INITIAL_PEERS = INITIAL_PEERS.split()
-
-
-BLOCK_UID = os.environ.get("BLOCK_UID")
-if not BLOCK_UID:
-    raise RuntimeError("Must specify BLOCK_UID as an index of a transformer block to be tested")
-
-REF_NAME = os.environ.get("REF_NAME", "bigscience/test-bloomd-6b3")
-
-
-# seq_length > 128: rpc_forward_stream & rpc_backward_stream
-# seq_length <= 128: rpc_forward & rpc_backward
-def test_forward_backward_exact_match(atol_forward=1e-4, atol_backward=1e-4, seq_length=1):
-    dht = hivemind.DHT(initial_peers=INITIAL_PEERS, client_mode=True, start=True)
-    (remote_block,) = get_remote_module(dht, BLOCK_UID)
-    assert remote_block is not None, f"Could not find {BLOCK_UID} in DHT"
-    assert isinstance(remote_block, RemoteTransformerBlock)
-
-    _ = remote_block.info  # lazy-init info now, because otherwise we will _break_ info init by chaning _info
-    remote_block._info = ExpertInfo("bloom6b3.3 bloom6b3.4 bloom6b3.5", remote_block._info.peer_id)
-
-    ref_blocks = [
-        load_pretrained_block(REF_NAME, 3, torch_dtype=torch.float32),
-        load_pretrained_block(REF_NAME, 4, torch_dtype=torch.float32),
-        load_pretrained_block(REF_NAME, 5, torch_dtype=torch.float32),
-    ]
-    inputs = torch.randn(1, seq_length, 4096, requires_grad=True)
-    outputs_rpc = remote_block.forward(inputs)[0]
-    outputs_rpc.sum().backward()
-    grads_rpc = inputs.grad
-
-    inputs.grad = None
-    hidden_states = inputs
-    for ref_block in ref_blocks:
-        hidden_states = ref_block.forward(hidden_states)[0]
-    outputs_ref = hidden_states
-    outputs_ref.sum().backward()
-    grads_ref = inputs.grad
-
-    assert torch.allclose(outputs_ref, outputs_rpc, rtol=0, atol=atol_forward)
-    assert torch.allclose(grads_ref, grads_rpc, rtol=0, atol=atol_backward)

+ 0 - 64
tests/test_chained_inference.py

@@ -1,64 +0,0 @@
-######
-# Warning:torch this test is a work in progress. It will be modified soon.
-# - if you want more stable tests, see test_block_exact_match
-# - if you want to figure out chained inference, ask yozh
-
-import os
-
-import hivemind
-import torch
-from hivemind.moe.expert_uid import ExpertInfo
-
-from src.bloom.from_pretrained import load_pretrained_block
-from src.client.remote_block import RemoteTransformerBlock
-from src.dht_utils import get_remote_module
-
-INITIAL_PEERS = os.environ.get("INITIAL_PEERS")
-if not INITIAL_PEERS:
-    raise RuntimeError("Must specify INITIAL_PEERS environment variable with one or more peer ids")
-INITIAL_PEERS = INITIAL_PEERS.split()
-
-
-BLOCK_UID = os.environ.get("BLOCK_UID")
-if not BLOCK_UID:
-    raise RuntimeError("Must specify BLOCK_UID as an index of a transformer block to be tested")
-
-REF_NAME = os.environ.get("REF_NAME", "bigscience/test-bloomd-6b3")
-REF_INDEX = int(os.environ.get("REF_INDEX", BLOCK_UID[-1].split(".")[-1]))
-
-
-def test_remote_block_exact_match(atol_inference=1e-4):
-    dht = hivemind.DHT(initial_peers=INITIAL_PEERS, client_mode=True, start=True)
-    remote_block = get_remote_module(dht, BLOCK_UID)
-    assert remote_block is not None, f"Could not find {BLOCK_UID} in DHT"
-    assert isinstance(remote_block, RemoteTransformerBlock)
-
-    _ = remote_block.info  # lazy-init info now, because otherwise we will _break_ info init by chaning _info
-    remote_block._info = ExpertInfo("bloom6b3.3 bloom6b3.4", remote_block._info.peer_id)
-
-    inputs = torch.randn(1, 8, 4096)
-
-    outputs_inference = []
-    with remote_block.inference_session() as sess:
-        for i in range(inputs.shape[1]):
-            outputs_inference.append(sess.step(inputs[:, i : i + 1, :]))
-    outputs_inference = torch.cat(outputs_inference, dim=1)
-
-    ref_blocks = [
-        load_pretrained_block(REF_NAME, 3, torch_dtype=torch.float32),
-        load_pretrained_block(REF_NAME, 4, torch_dtype=torch.float32),
-    ]
-    outputs_ref = []
-    caches = [None, None]
-    for i in range(inputs.shape[1]):
-        new_caches = []
-        hidden_states = inputs[:, i : i + 1, :]
-        for ref_block, cache in zip(ref_blocks, caches):
-            with torch.no_grad():
-                hidden_states, new_cache = ref_block.forward(hidden_states, use_cache=True, layer_past=cache)
-                new_caches.append(new_cache)
-
-        outputs_ref.append(hidden_states)
-        caches = new_caches
-    outputs_ref = torch.cat(outputs_ref, dim=1)
-    assert torch.allclose(outputs_ref, outputs_inference, rtol=0, atol=atol_inference)

+ 23 - 19
tests/test_full_model.py

@@ -24,9 +24,10 @@ if not MODEL_NAME:
 REF_NAME = os.environ.get("REF_NAME")
 
 
-def test_full_model_exact_match(atol_forward=1e-5, atol_inference=1e-3, prefix="bloom6b3"):
+def test_full_model_exact_match(atol_forward=1e-3, atol_inference=1e-3):
     tokenizer = transformers.BloomTokenizerFast.from_pretrained(MODEL_NAME)
     model = DistributedBloomForCausalLM.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
+    assert isinstance(model, DistributedBloomForCausalLM)
     assert len(model.transformer.h) == model.config.n_layer
 
     test_inputs = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
@@ -35,26 +36,29 @@ def test_full_model_exact_match(atol_forward=1e-5, atol_inference=1e-3, prefix="
     logger.info("Forward outputs are finite")
 
     if REF_NAME:
-        ref_model = transformers.AutoModelForCausalLM.from_pretrained(REF_NAME)
-        dummy_mask = torch.ones_like(test_inputs, dtype=torch.bool)
-        # note: this creates a dummy mask to make the test compatible with older transformer versions
-        # prior to https://github.com/huggingface/transformers/pull/17837
-        ref_outputs = ref_model.forward(test_inputs, attention_mask=dummy_mask).logits
-        assert torch.allclose(ref_outputs, parallel_outputs, rtol=0, atol=atol_forward)
+        with torch.no_grad():
+            ref_model = transformers.AutoModelForCausalLM.from_pretrained(REF_NAME)
+            dummy_mask = torch.ones_like(test_inputs, dtype=torch.bool)
+            # note: this creates a dummy mask to make the test compatible with older transformer versions
+            # prior to https://github.com/huggingface/transformers/pull/17837
+            ref_outputs = ref_model.forward(test_inputs, attention_mask=dummy_mask).logits
+            assert torch.allclose(ref_outputs, parallel_outputs, rtol=0, atol=atol_forward)
+            del ref_model, ref_outputs
     else:
         logger.warning("Did not test exact match with local model: REF_NAME environment variable is not set")
 
-    embs = model.transformer.word_embeddings(test_inputs)
-    embs = model.transformer.word_embeddings_layernorm(embs)
-    recurrent_outputs = []
-    with model.transformer.h.inference_session() as sess:
-        for t in range(embs.shape[1]):
-            recurrent_outputs.append(sess.step(embs[:, t : t + 1, :]))
-    recurrent_outputs = torch.cat(recurrent_outputs, dim=1)
-    recurrent_outputs = model.transformer.ln_f(recurrent_outputs)
-
-    dictionary = model.transformer.word_embeddings.weight.t()
-    recurrent_outputs = recurrent_outputs.to(dictionary.dtype)
-    recurrent_outputs = (recurrent_outputs @ dictionary).float()
+    with torch.inference_mode():
+        embs = model.transformer.word_embeddings(test_inputs)
+        embs = model.transformer.word_embeddings_layernorm(embs)
+        recurrent_outputs = []
+        with model.transformer.h.inference_session() as sess:
+            for t in range(embs.shape[1]):
+                recurrent_outputs.append(sess.step(embs[:, t : t + 1, :]))
+        recurrent_outputs = torch.cat(recurrent_outputs, dim=1)
+        recurrent_outputs = model.transformer.ln_f(recurrent_outputs)
+
+        dictionary = model.transformer.word_embeddings.weight.t()
+        recurrent_outputs = recurrent_outputs.to(dictionary.dtype)
+        recurrent_outputs = (recurrent_outputs @ dictionary).float()
     assert torch.allclose(recurrent_outputs, parallel_outputs, rtol=0, atol=atol_inference)
     logger.info("Inference is consistent with forward")

Some files were not shown because too many files changed in this diff