@@ -61,12 +61,12 @@ class LeanFFN(nn.Module):
     """

     def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        activation=F.gelu,
-        gated: bool = False,
-        layer_norm_eps: float = 1e-12,
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        activation=F.gelu,
+        gated: bool = False,
+        layer_norm_eps: float = 1e-12,
     ):
         super().__init__()
         self.dense_i2h = nn.Linear(hidden_size, intermediate_size * 2 if gated else intermediate_size)
@@ -103,17 +103,17 @@ class _LeanFFN(torch.autograd.Function):
     @staticmethod
     @custom_fwd
     def forward(
-        ctx,
-        input,
-        ln_weight,
-        ln_bias,
-        i2h_weight,
-        i2h_bias,
-        h2o_weight,
-        h2o_bias,
-        activation,
-        training,
-        ln_eps,
+        ctx,
+        input,
+        ln_weight,
+        ln_bias,
+        i2h_weight,
+        i2h_bias,
+        h2o_weight,
+        h2o_bias,
+        activation,
+        training,
+        ln_eps,
     ):
         ctx._activation, ctx._training, ctx._ln_eps = activation, training, ln_eps
         ctx._cpu_rng_state = torch.get_rng_state()
@@ -179,17 +179,17 @@ class _LeanFFN(torch.autograd.Function):
         grad_h2o_bias = grad_output_2d.sum(0)

         return (
-            grad_input,
-            grad_ln_weight,
-            grad_ln_bias,
-            grad_i2h_weight,
-            grad_i2h_bias,
-            grad_h2o_weight,
-            grad_h2o_bias,
-            None,
-            None,
-            None,
-            None,
+            grad_input,
+            grad_ln_weight,
+            grad_ln_bias,
+            grad_i2h_weight,
+            grad_i2h_bias,
+            grad_h2o_weight,
+            grad_h2o_bias,
+            None,
+            None,
+            None,
+            None,
         )

@@ -212,7 +212,7 @@ class RotaryEmbeddings(nn.Module):
         self.register_buffer("cos", cos)
         self.register_buffer("sin", sin)

-        return rotate(x, cos[None, offset: seq_len + offset, None, :], sin[None, offset: seq_len + offset, None, :])
+        return rotate(x, cos[None, offset : seq_len + offset, None, :], sin[None, offset : seq_len + offset, None, :])


 @torch.no_grad()
@@ -243,13 +243,13 @@ def rotate(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tenso

 class LeanSelfAttention(nn.Module):
     def __init__(
-        self,
-        hidden_size: int,
-        num_attention_heads: int,
-        max_positions: int,
-        attention_core: Optional[nn.Module] = None,
-        layer_norm_eps: float = 1e-12,
-        **kwargs,
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        max_positions: int,
+        attention_core: Optional[nn.Module] = None,
+        layer_norm_eps: float = 1e-12,
+        **kwargs,
     ):
         """Attention layer that does not hog GPU memory"""
         super().__init__()
@@ -311,7 +311,7 @@ class SimpleAttentionCore(nn.Module):
         attention_scores = attention_scores / math.sqrt(query.shape[-1])

         query_length, key_length = query.size(-2), key.size(-2)
-        causal_mask = self.bias[:, :, key_length - query_length: key_length, :key_length].bool()
+        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool()
         attention_scores = torch.where(causal_mask, attention_scores, self.masked_bias.to(attention_scores.dtype))

         if attention_mask is not None:
@@ -330,12 +330,12 @@ class RotaryAttentionCore(SimpleAttentionCore):
     """Attention core that applies rotary embeddings to queries and keys before computing dot products"""

     def __init__(
-        self,
-        hidden_size: int,
-        num_attention_heads: int,
-        max_positions: int,
-        rotary_emb: Optional[RotaryEmbeddings] = None,
-        **kwargs,
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        max_positions: int,
+        rotary_emb: Optional[RotaryEmbeddings] = None,
+        **kwargs,
     ):
         super().__init__(hidden_size, num_attention_heads, max_positions, **kwargs)
         if rotary_emb is None:
@@ -393,7 +393,7 @@ class LeanAlbertEmbeddings(nn.Module):

     # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward
     def forward(
-        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
     ):
         if input_ids is not None:
             input_shape = input_ids.size()
@@ -413,7 +413,7 @@ class LeanAlbertEmbeddings(nn.Module):

         if self.position_embeddings is not None:
             if position_ids is None:
-                position_ids = self.position_ids[:, past_key_values_length: seq_length + past_key_values_length]
+                position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
             position_embeddings = self.position_embeddings(position_ids)
             embeddings += position_embeddings

@@ -458,8 +458,7 @@ class LeanAlbertLayerGroup(AlbertLayerGroup):
         self.albert_layers = nn.ModuleList([LeanAlbertLayer(config) for _ in range(config.inner_group_num)])

     def forward(
-        self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False,
-        output_hidden_states=False
+        self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False
     ):
         if head_mask is not None and any(head_mask):
             raise NotImplementedError(f"head mask was provided, but it is not supported")
@@ -496,13 +495,13 @@ class LeanAlbertTransformer(AlbertTransformer):
         self.post_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps)

     def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        head_mask=None,
-        output_attentions=False,
-        output_hidden_states=False,
-        return_dict=True,
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
     ):
         # TODO this should entire be replaced with inheritance and post_layer_norm
         hidden_states = self.embedding_hidden_mapping_in(hidden_states)
@@ -585,7 +584,7 @@ class LeanAlbertForPreTraining(AlbertForPreTraining, PreTrainedModel):
 from hivemind.moe.server.layers.custom_experts import register_expert_class

 head_sample_input = lambda batch_size, hid_dim: (
-    torch.randint(low=0, high=1000, size=(batch_size, 512), dtype=torch.long),
+    torch.randint(low=0, high=1000, size=(batch_size, 512), dtype=torch.long),
 )

@@ -602,7 +601,7 @@ class HeadExpert(nn.Module):

     def forward(self, input_ids):
         embedding_output = self.embeddings(input_ids)
-        encoder_outputs, = self.encoder(embedding_output, return_dict=False)
+        (encoder_outputs,) = self.encoder(embedding_output, return_dict=False)

         return encoder_outputs

@@ -644,9 +643,8 @@ class BodyExpert(nn.Module):


 tail_sample_input = lambda batch_size, hid_dim: (
-
-    torch.empty((batch_size, 512, hid_dim)),
-    torch.randint(0, 1000, (batch_size, 512), dtype=torch.long),
+    torch.empty((batch_size, 512, hid_dim)),
+    torch.randint(0, 1000, (batch_size, 512), dtype=torch.long),
 )
