Browse Source

Dht documentation & cosmetic fixes (#47)

* DHT -> HivemindDHT

* Server -> HivemindServer

* Server -> HivemindServer

* minor docstring renames

* update scheme for rtfd.io

* add basic dht docs

* docstring fixes across dht and server

* fix url resolver

* miscellaneous sphinx compatibility fixes

* miscellaneous sphinx compatibility fixes

* rollback HivemindDHT -> DHT, HivemindServer -> Server
justheuristic 5 years ago
parent
commit
656cdf9eb6

BIN
docs/_static/dht.odp


BIN
docs/_static/dht.pdf


BIN
docs/_static/dht.png


+ 2 - 1
docs/conf.py

@@ -240,7 +240,8 @@ def linkcode_resolve(domain, info):
     if domain != 'py' or not info['module']:
     if domain != 'py' or not info['module']:
         return None
         return None
     try:
     try:
-        filename = 'hivemind/%s#L%d-L%d' % find_source()
+        filename = '%s#L%d-L%d' % find_source()
     except Exception:
     except Exception:
         filename = info['module'].replace('.', '/') + '.py'
         filename = info['module'].replace('.', '/') + '.py'
+
     return "https://github.com/learning-at-home/hivemind/blob/%s/%s" % (branch, filename)
     return "https://github.com/learning-at-home/hivemind/blob/%s/%s" % (branch, filename)

+ 1 - 0
docs/index.rst

@@ -20,6 +20,7 @@ API documentation:
 
 
   modules/client.rst
   modules/client.rst
   modules/server.rst
   modules/server.rst
+  modules/dht.rst
 
 
 Indices and tables
 Indices and tables
 ==================
 ==================

+ 44 - 0
docs/modules/dht.rst

@@ -0,0 +1,44 @@
+``hivemind.dht``
+====================
+
+.. image:: ../_static/dht.png
+   :width: 800
+
+.. automodule:: hivemind.dht
+
+.. currentmodule:: hivemind.dht
+
+
+.. autoclass:: DHT
+   :members:
+   :exclude-members: make_key
+   :member-order: bysource
+
+.. autoclass:: DHTNode
+   :members:
+   :member-order: bysource
+
+.. currentmodule:: hivemind.dht.protocol
+
+.. autoclass:: KademliaProtocol
+   :members:
+   :member-order: bysource
+
+.. currentmodule:: hivemind.dht.routing
+
+.. autoclass:: RoutingTable
+   :members:
+   :member-order: bysource
+
+.. autoclass:: KBucket
+   :members:
+   :member-order: bysource
+
+.. autoclass:: DHTID
+   :members:
+   :exclude-members: HASH_FUNC
+   :member-order: bysource
+
+.. currentmodule:: hivemind.dht.search
+
+.. autofunction:: traverse_dht

+ 1 - 1
docs/modules/server.rst

@@ -21,5 +21,5 @@
     :member-order: bysource
     :member-order: bysource
 
 
 .. autoclass:: TaskPool
 .. autoclass:: TaskPool
-    :members: submit_task, form_batch, load_batch_to_runtime, send_outputs_from_runtime, get_task_size, empty
+    :members: submit_task, iterate_minibatches, load_batch_to_runtime, send_outputs_from_runtime, get_task_size, empty
     :member-order: bysource
     :member-order: bysource

+ 2 - 2
hivemind/client/expert.py

@@ -16,8 +16,8 @@ class RemoteExpert(nn.Module):
     Sending wrong input shapes can cause RemoteExpert to freeze indefinitely due to error in runtime.
     Sending wrong input shapes can cause RemoteExpert to freeze indefinitely due to error in runtime.
 
 
     :param uid: unique expert identifier
     :param uid: unique expert identifier
-    :param host: hostname where Server operates
-    :param port: port to which Server listens
+    :param host: hostname where server operates
+    :param port: port to which server listens
     """
     """
 
 
     def __init__(self, uid, host='127.0.0.1', port=8080):
     def __init__(self, uid, host='127.0.0.1', port=8080):

+ 7 - 1
hivemind/client/moe.py

@@ -147,7 +147,13 @@ class RemoteMixtureOfExperts(nn.Module):
 
 
     def compute_expert_scores(
     def compute_expert_scores(
             self, grid_scores: List[torch.Tensor], batch_experts: List[List[RemoteExpert]]) -> torch.Tensor:
             self, grid_scores: List[torch.Tensor], batch_experts: List[List[RemoteExpert]]) -> torch.Tensor:
-        """ TODO(jheuristic) docstring here """
+        """
+        Compute scores for each expert by adding up grid scores, autograd-friendly
+        :param grid_scores: list of torch tensors, i-th tensor contains scores for i-th grid dimension
+        :param batch_experts: list(batch) of lists(k) of up to k experts selected for this batch
+        :returns: a tensor of scores, float32[batch_size, k]
+        :note: if some rows in batch have less than max number of experts, their scores will be padded with -inf
+        """
         expert_counts = list(map(len, batch_experts))
         expert_counts = list(map(len, batch_experts))
         batch_size = len(batch_experts)
         batch_size = len(batch_experts)
         max_num_experts = max(expert_counts)
         max_num_experts = max(expert_counts)

+ 5 - 1
hivemind/dht/__init__.py

@@ -22,8 +22,9 @@ from ..utils import SharedFuture, find_open_port, Hostname, Port, run_in_backgro
 class DHT(mp.Process):
 class DHT(mp.Process):
     """
     """
     A high-level interface to hivemind DHT. Runs a dht node in a background process.
     A high-level interface to hivemind DHT. Runs a dht node in a background process.
+
     :param initial_peers: one or multiple pairs of (host, port) pointing to active DHT peers. Default: no peers
     :param initial_peers: one or multiple pairs of (host, port) pointing to active DHT peers. Default: no peers
-    :param port: a port where DHT will listen to incoming connections. Defaults to hivemind.utils.find_open_port
+    :param port: a port where DHT node will listen to incoming connections. Defaults to hivemind.utils.find_open_port
     :param start: if True, automatically starts the background process on creation. Otherwise await manual start
     :param start: if True, automatically starts the background process on creation. Otherwise await manual start
     :param daemon: if True, the background process is marked as daemon and automatically terminated after main process
     :param daemon: if True, the background process is marked as daemon and automatically terminated after main process
     :param node_params: any other params will be forwarded to DHTNode upon creation
     :param node_params: any other params will be forwarded to DHTNode upon creation
@@ -45,6 +46,7 @@ class DHT(mp.Process):
             self.run_in_background(await_ready=True)
             self.run_in_background(await_ready=True)
 
 
     def run(self) -> None:
     def run(self) -> None:
+        """ Serve DHT forever. This function will not return until DHT node is shut down """
         if asyncio.get_event_loop().is_running():
         if asyncio.get_event_loop().is_running():
             asyncio.get_event_loop().stop()  # if we're in jupyter, get rid of its built-in event loop
             asyncio.get_event_loop().stop()  # if we're in jupyter, get rid of its built-in event loop
         loop = asyncio.new_event_loop()
         loop = asyncio.new_event_loop()
@@ -102,6 +104,7 @@ class DHT(mp.Process):
     def declare_experts(self, uids: List[str], addr, port, wait=True, timeout=None) -> Optional[List[bool]]:
     def declare_experts(self, uids: List[str], addr, port, wait=True, timeout=None) -> Optional[List[bool]]:
         """
         """
         Make experts available to DHT; update timestamps if already available
         Make experts available to DHT; update timestamps if already available
+
         :param uids: a list of expert ids to update
         :param uids: a list of expert ids to update
         :param addr: hostname that can be used to call this expert
         :param addr: hostname that can be used to call this expert
         :param port: port that can be used to call this expert
         :param port: port that can be used to call this expert
@@ -139,6 +142,7 @@ class DHT(mp.Process):
     def first_k_active(self, prefixes: List[str], k: int, max_prefetch=None):
     def first_k_active(self, prefixes: List[str], k: int, max_prefetch=None):
         """
         """
         Find k prefixes with active experts; may return less if there aren't enough; used for DMoE beam search
         Find k prefixes with active experts; may return less if there aren't enough; used for DMoE beam search
+
         :param prefixes: a list of uid prefixes ordered from highest to lowest priority
         :param prefixes: a list of uid prefixes ordered from highest to lowest priority
         :param k: return at most *this many* active prefixes
         :param k: return at most *this many* active prefixes
         :param max_prefetch: pre-dispatch up to *this many* asynchronous expert requests, defaults to pre-dispatch = k
         :param max_prefetch: pre-dispatch up to *this many* asynchronous expert requests, defaults to pre-dispatch = k

+ 10 - 6
hivemind/dht/node.py

@@ -37,10 +37,12 @@ class DHTNode:
     Informally, dht nodes always prefer values with higher expiration_time and may delete any value past its expiration.
     Informally, dht nodes always prefer values with higher expiration_time and may delete any value past its expiration.
 
 
     Formally, DHTNode follows this contract:
     Formally, DHTNode follows this contract:
-      - when asked to store(key, value, expiration_time), a node must store (key, value) at least until expiration_time
-       unless it already stores that key with greater or equal expiration_time - if so, node must keep the previous key
-      - when asked to get(key), a node must return the value with highest expiration time IF that time has not come yet
-       if expiration time is greater than current get_dht_time(), DHTNode *may* return None
+
+    - when asked to store(key, value, expiration_time), a node must store (key, value) at least until expiration_time
+      unless it already stores that key with greater or equal expiration_time - if so, node must keep the previous key
+    - when asked to get(key), a node must return the value with highest expiration time IF that time has not come yet
+      if expiration time is greater than current get_dht_time(), DHTNode *may* return None
+
     """
     """
 
 
     def __init__(self, node_id: Optional[DHTID] = None, port: Optional[Port] = None, initial_peers: List[Endpoint] = (),
     def __init__(self, node_id: Optional[DHTID] = None, port: Optional[Port] = None, initial_peers: List[Endpoint] = (),
@@ -89,8 +91,9 @@ class DHTNode:
                                  beam_size: Optional[int] = None, exclude_self: bool = False) -> Dict[DHTID, Endpoint]:
                                  beam_size: Optional[int] = None, exclude_self: bool = False) -> Dict[DHTID, Endpoint]:
         """
         """
         Traverse the DHT and find :k_nearest: nodes to a given :query_id:, optionally :exclude_self: from the results.
         Traverse the DHT and find :k_nearest: nodes to a given :query_id:, optionally :exclude_self: from the results.
-        :note: this is a thin wrapper over dht.search.beam_search, look there for more details
+
         :returns: an ordered dictionary of [peer DHTID -> network Endpoint], ordered from nearest to farthest neighbor
         :returns: an ordered dictionary of [peer DHTID -> network Endpoint], ordered from nearest to farthest neighbor
+        :note: this is a thin wrapper over dht.search.traverse_dht, look there for more details
         """
         """
         k_nearest = k_nearest if k_nearest is not None else self.protocol.bucket_size
         k_nearest = k_nearest if k_nearest is not None else self.protocol.bucket_size
         beam_size = beam_size if beam_size is not None else max(self.protocol.bucket_size, k_nearest)
         beam_size = beam_size if beam_size is not None else max(self.protocol.bucket_size, k_nearest)
@@ -116,7 +119,8 @@ class DHTNode:
         """
         """
         Find beam_size best nodes to store (key, value) and store it there at least until expiration time.
         Find beam_size best nodes to store (key, value) and store it there at least until expiration time.
         Also cache (key, value, expiration_time) at all nodes you met along the way (see Section 2.1 end)
         Also cache (key, value, expiration_time) at all nodes you met along the way (see Section 2.1 end)
-        :return: True if store succeeds, False if it fails (due to no response or newer value)
+
+        :returns: True if store succeeds, False if it fails (due to no response or newer value)
         """
         """
         key_id = DHTID.generate(key)
         key_id = DHTID.generate(key)
         nearest_node_to_addr = await self.find_nearest_nodes(key_id, k_nearest=self.num_replicas, exclude_self=True)
         nearest_node_to_addr = await self.find_nearest_nodes(key_id, k_nearest=self.num_replicas, exclude_self=True)

+ 8 - 2
hivemind/dht/protocol.py

@@ -55,6 +55,7 @@ class KademliaProtocol(RPCProtocol):
                          expiration_time: DHTExpiration, in_cache: bool = False) -> Optional[bool]:
                          expiration_time: DHTExpiration, in_cache: bool = False) -> Optional[bool]:
         """
         """
         Ask a recipient to store (key, value) pair until expiration time or update their older value
         Ask a recipient to store (key, value) pair until expiration time or update their older value
+
         :returns: True if value was accepted, False if it was rejected (recipient has newer value), None if no response
         :returns: True if value was accepted, False if it was rejected (recipient has newer value), None if no response
         """
         """
         responded, response = await self.store(recipient, bytes(self.node_id), bytes(key),
         responded, response = await self.store(recipient, bytes(self.node_id), bytes(key),
@@ -69,6 +70,7 @@ class KademliaProtocol(RPCProtocol):
                       query_id_bytes: BinaryDHTID) -> Tuple[List[Tuple[BinaryDHTID, Endpoint]], BinaryDHTID]:
                       query_id_bytes: BinaryDHTID) -> Tuple[List[Tuple[BinaryDHTID, Endpoint]], BinaryDHTID]:
         """
         """
         Someone wants to find :key_node: in the DHT. Give him k nearest neighbors from our routing table
         Someone wants to find :key_node: in the DHT. Give him k nearest neighbors from our routing table
+
         :returns: a list of pairs (node_id, address) of :bucket_size: nearest to key_node according to XOR distance,
         :returns: a list of pairs (node_id, address) of :bucket_size: nearest to key_node according to XOR distance,
          also returns our own node id for routing table maintenance
          also returns our own node id for routing table maintenance
         """
         """
@@ -81,6 +83,7 @@ class KademliaProtocol(RPCProtocol):
         """
         """
         Ask a recipient to give you nearest neighbors to key_node. If recipient knows key_node directly,
         Ask a recipient to give you nearest neighbors to key_node. If recipient knows key_node directly,
          it will be returned as first of the neighbors; if recipient does not respond, return empty dict.
          it will be returned as first of the neighbors; if recipient does not respond, return empty dict.
+
         :returns: a dictionary[node id => address] as per Section 2.3 of the paper
         :returns: a dictionary[node id => address] as per Section 2.3 of the paper
         """
         """
         responded, response = await self.find_node(recipient, bytes(self.node_id), bytes(query_id))
         responded, response = await self.find_node(recipient, bytes(self.node_id), bytes(query_id))
@@ -97,8 +100,9 @@ class KademliaProtocol(RPCProtocol):
         """
         """
         Someone wants to find value corresponding to key. If we have the value, return the value and its expiration time
         Someone wants to find value corresponding to key. If we have the value, return the value and its expiration time
          Either way, return :bucket_size: nearest neighbors to that node.
          Either way, return :bucket_size: nearest neighbors to that node.
-        :note: this is a deviation from Section 2.3 of the paper, original kademlia returner EITHER value OR neighbors
+
         :returns: (value or None if we have no value, nearest neighbors, our own dht id)
         :returns: (value or None if we have no value, nearest neighbors, our own dht id)
+        :note: this is a deviation from Section 2.3 of the paper, original kademlia returned EITHER value OR neighbors
         """
         """
         maybe_value, maybe_expiration = self.storage.get(DHTID.from_bytes(key_bytes))
         maybe_value, maybe_expiration = self.storage.get(DHTID.from_bytes(key_bytes))
         cached_value, cached_expiration = self.cache.get(DHTID.from_bytes(key_bytes))
         cached_value, cached_expiration = self.cache.get(DHTID.from_bytes(key_bytes))
@@ -111,11 +115,12 @@ class KademliaProtocol(RPCProtocol):
             Tuple[Optional[DHTValue], Optional[DHTExpiration], Dict[DHTID, Endpoint]]:
             Tuple[Optional[DHTValue], Optional[DHTExpiration], Dict[DHTID, Endpoint]]:
         """
         """
         Ask a recipient to give you the value, if it has one, or nearest neighbors to your key.
         Ask a recipient to give you the value, if it has one, or nearest neighbors to your key.
+
         :returns: (optional value, optional expiration time, and neighbors)
         :returns: (optional value, optional expiration time, and neighbors)
          value: whatever was the latest value stored by the recipient with that key (see DHTNode contract)
          value: whatever was the latest value stored by the recipient with that key (see DHTNode contract)
          expiration time: expiration time of the returned value, None if no value was found
          expiration time: expiration time of the returned value, None if no value was found
          neighbors:  a dictionary[node id => address] as per Section 2.3 of the paper;
          neighbors:  a dictionary[node id => address] as per Section 2.3 of the paper;
-        Note: if no response, returns None, None, {}
+        :note: if no response, returns None, None, {}
         """
         """
         responded, response = await self.find_value(recipient, bytes(self.node_id), bytes(key))
         responded, response = await self.find_value(recipient, bytes(self.node_id), bytes(key))
         if responded:
         if responded:
@@ -128,6 +133,7 @@ class KademliaProtocol(RPCProtocol):
     async def update_routing_table(self, node_id: Optional[DHTID], addr: Endpoint, responded=True):
     async def update_routing_table(self, node_id: Optional[DHTID], addr: Endpoint, responded=True):
         """
         """
         This method is called on every incoming AND outgoing request to update the routing table
         This method is called on every incoming AND outgoing request to update the routing table
+
         :param addr: sender endpoint for incoming requests, recipient endpoint for outgoing requests
         :param addr: sender endpoint for incoming requests, recipient endpoint for outgoing requests
         :param node_id: sender node id for incoming requests, recipient node id for outgoing requests
         :param node_id: sender node id for incoming requests, recipient node id for outgoing requests
         :param responded: for outgoing requests, this indicates whether recipient responded or not.
         :param responded: for outgoing requests, this indicates whether recipient responded or not.

+ 9 - 2
hivemind/dht/routing.py

@@ -16,6 +16,7 @@ from ..utils import Endpoint, PickleSerializer
 class RoutingTable:
 class RoutingTable:
     """
     """
     A data structure that contains DHT peers bucketed according to their distance to node_id
     A data structure that contains DHT peers bucketed according to their distance to node_id
+
     :param node_id: node id used to measure distance
     :param node_id: node id used to measure distance
     :param bucket_size: parameter $k$ from Kademlia paper Section 2.2
     :param bucket_size: parameter $k$ from Kademlia paper Section 2.2
     :param depth_modulo: parameter $b$ from Kademlia paper Section 2.2.
     :param depth_modulo: parameter $b$ from Kademlia paper Section 2.2.
@@ -38,6 +39,7 @@ class RoutingTable:
     def add_or_update_node(self, node_id: DHTID, addr: Endpoint) -> Optional[Tuple[DHTID, Endpoint]]:
     def add_or_update_node(self, node_id: DHTID, addr: Endpoint) -> Optional[Tuple[DHTID, Endpoint]]:
         """
         """
         Update routing table after an incoming request from :addr: (host:port) or outgoing request to :addr:
         Update routing table after an incoming request from :addr: (host:port) or outgoing request to :addr:
+
         :returns: If we cannot add node_id to the routing table, return the least-recently-updated node (Section 2.2)
         :returns: If we cannot add node_id to the routing table, return the least-recently-updated node (Section 2.2)
         :note: KademliaProtocol calls this method for every incoming and outgoing request if there was a response.
         :note: KademliaProtocol calls this method for every incoming and outgoing request if there was a response.
           If this method returned a node to be ping-ed, the protocol will ping it to check and either move it to
           If this method returned a node to be ping-ed, the protocol will ping it to check and either move it to
@@ -81,6 +83,7 @@ class RoutingTable:
             self, query_id: DHTID, k: int, exclude: Optional[DHTID] = None) -> List[Tuple[DHTID, Endpoint]]:
             self, query_id: DHTID, k: int, exclude: Optional[DHTID] = None) -> List[Tuple[DHTID, Endpoint]]:
         """
         """
         Find k nearest neighbors from routing table according to XOR distance, does NOT include self.node_id
         Find k nearest neighbors from routing table according to XOR distance, does NOT include self.node_id
+
         :param query_id: find neighbors of this node
         :param query_id: find neighbors of this node
         :param k: find this many neighbors. If there aren't enough nodes in the table, returns all nodes
         :param k: find this many neighbors. If there aren't enough nodes in the table, returns all nodes
         :param exclude: if True, results will not contain query_node_id even if it is in table
         :param exclude: if True, results will not contain query_node_id even if it is in table
@@ -147,6 +150,7 @@ class KBucket:
         """
         """
         Add node to KBucket or update existing node, return True if successful, False if the bucket is full.
         Add node to KBucket or update existing node, return True if successful, False if the bucket is full.
         If the bucket is full, keep track of node in a replacement list, per section 4.1 of the paper.
         If the bucket is full, keep track of node in a replacement list, per section 4.1 of the paper.
+
         :param node_id: dht node identifier that should be added or moved to the front of bucket
         :param node_id: dht node identifier that should be added or moved to the front of bucket
         :param addr: a pair of (hostname, port) associated with that node id
         :param addr: a pair of (hostname, port) associated with that node id
         :note: this function has a side-effect of resetting KBucket.last_updated time
         :note: this function has a side-effect of resetting KBucket.last_updated time
@@ -225,6 +229,7 @@ class DHTID(int):
     def generate(cls, source: Optional[Any] = None, nbits: int = 255):
     def generate(cls, source: Optional[Any] = None, nbits: int = 255):
         """
         """
         Generates random uid based on SHA1
         Generates random uid based on SHA1
+
         :param source: if provided, converts this value to bytes and uses it as input for hashing function;
         :param source: if provided, converts this value to bytes and uses it as input for hashing function;
             by default, generates a random dhtid from :nbits: random bits
             by default, generates a random dhtid from :nbits: random bits
         """
         """
@@ -249,11 +254,13 @@ class DHTID(int):
         return len(os.path.commonprefix(ids_bits))
         return len(os.path.commonprefix(ids_bits))
 
 
     def to_bytes(self, length=HASH_NBYTES, byteorder='big', *, signed=False) -> bytes:
     def to_bytes(self, length=HASH_NBYTES, byteorder='big', *, signed=False) -> bytes:
+        """ A standard way to serialize DHTID into bytes """
         return super().to_bytes(length, byteorder, signed=signed)
         return super().to_bytes(length, byteorder, signed=signed)
 
 
     @classmethod
     @classmethod
-    def from_bytes(self, bytes, byteorder='big', *, signed=False) -> DHTID:
-        return DHTID(super().from_bytes(bytes, byteorder=byteorder, signed=signed))
+    def from_bytes(cls, raw: bytes, byteorder='big', *, signed=False) -> DHTID:
+        """ reverse of to_bytes """
+        return DHTID(super().from_bytes(raw, byteorder=byteorder, signed=signed))
 
 
     def __repr__(self):
     def __repr__(self):
         return f"{self.__class__.__name__}({hex(self)})"
         return f"{self.__class__.__name__}({hex(self)})"

+ 1 - 1
hivemind/dht/search.py

@@ -14,7 +14,7 @@ async def traverse_dht(query_id: DHTID, initial_nodes: Collection[DHTID], k_near
     Approximate time complexity: O(T * log T) where T = (path_to_true_nearest + beam_size) * mean_num_neighbors
     Approximate time complexity: O(T * log T) where T = (path_to_true_nearest + beam_size) * mean_num_neighbors
 
 
     :param query_id: search query, find k_nearest neighbors of this DHTID
     :param query_id: search query, find k_nearest neighbors of this DHTID
-    :param initial_nodes: nodes used to pre-populate beam search heap, e.g. [my_own_DHTID, *maybe_some_peers]
+    :param initial_nodes: nodes used to pre-populate beam search heap, e.g. [my_own_DHTID, ...maybe_some_peers]
     :param k_nearest: find up to this many nearest neighbors. If there are less nodes in the DHT, return all nodes
     :param k_nearest: find up to this many nearest neighbors. If there are less nodes in the DHT, return all nodes
     :param beam_size: beam search will not give up until it exhausts this many nearest nodes (to query_id) from the heap
     :param beam_size: beam search will not give up until it exhausts this many nearest nodes (to query_id) from the heap
         Recommended value: A beam size of k_nearest * (2-5) will yield near-perfect results.
         Recommended value: A beam size of k_nearest * (2-5) will yield near-perfect results.

+ 1 - 0
hivemind/runtime/task_pool.py

@@ -97,6 +97,7 @@ class TaskPool(TaskPoolBase):
         return future2
         return future2
 
 
     def iterate_minibatches(self, *args, **kwargs):
     def iterate_minibatches(self, *args, **kwargs):
+        """ Form minibatches by grouping one or more tasks together up to self.max_batch_size """
         batch = []
         batch = []
         total_size = 0
         total_size = 0