
DHT miscellaneous (#48)

* add option to share keys with new peers that *should* be sharing them (improves data availability when many peers join)

* added option to NOT refresh table (new default)

* initial dht crawl is no longer blocking

* sphinx-friendly escape char

* pass cache params to kademliaprotocol

* typo

* rpc congestion

* rpc congestion

* rpc congestion

* add max concurrent rpc

* add max concurrent rpc

* fix bug in welcome protocol: previously, DHT peers always considered each other "new nodes" and sent EVERYTHING to EVERYONE on each rpc call. Now DHT nodes only request a store on an explicit .store call OR when a new peer knocks on their DHT

* increase to 128 concurrent rpc

* await dht traversal in bootstrap

* await dht traversal in bootstrap

* minor comment
justheuristic 5 years ago
parent
commit
68675b255c
5 changed files with 83 additions and 31 deletions
  1. hivemind/dht/node.py (+44, -17)
  2. hivemind/dht/protocol.py (+32, -11)
  3. hivemind/dht/routing.py (+3, -0)
  4. hivemind/runtime/task_pool.py (+1, -1)
  5. tests/test_dht.py (+3, -2)

hivemind/dht/node.py (+44, -17)

@@ -1,4 +1,5 @@
 import asyncio
+import random
 from collections import OrderedDict
 from functools import partial
 from typing import Optional, Tuple, List, Dict
@@ -23,9 +24,16 @@ class DHTNode:
       Recommended value: $k$ is chosen s.t. any given k nodes are very unlikely to all fail after staleness_timeout
     :param num_replicas: (≈k) - number of nearest nodes that will be asked to store a given key, default = bucket_size
     :param depth_modulo: (b) - kademlia can split bucket if it contains root OR up to the nearest multiple of this value
+    :param max_concurrent_rpc: maximum number of outgoing RPC requests emitted by KademliaProtocol in parallel
+        Reduce this value if your RPC requests register no response despite the peer sending the response.
     :param wait_timeout: a kademlia rpc request is deemed lost if we did not receive a reply in this many seconds
     :param staleness_timeout: a bucket is considered stale if no node from that bucket was updated in this many seconds
+        if staleness_timeout is None, DHTNode will not refresh stale buckets (which is usually okay)
     :param bootstrap_timeout: after one of the peers responds, await the remaining peers for at most this many seconds
+    :param cache_locally: if True, caches all values (stored or found) in a node-local cache
+    :param cache_nearest: if above 0, whenever DHTNode finds a value, it will also store (cache) this value on this many
+        of the nearest nodes visited by the search algorithm. Prefers nodes that are nearest to :key: but have no value yet.
+    :param cache_size: if specified, local cache will store up to this many records (as in LRU cache)
     :param interface: provide 0.0.0.0 to operate over ipv4, :: to operate over ipv6, localhost to operate locally, etc.

     :note: Hivemind DHT is optimized to store temporary metadata that is regularly updated.
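A minimal usage sketch of the new constructor arguments (illustrative only; the import path follows this diff, while the port number is an arbitrary assumption):

    from hivemind.dht.node import DHTNode

    # first node: no initial peers; with staleness_timeout=None (the new default),
    # it never refreshes its routing table in the background
    alice = DHTNode(port=33010, staleness_timeout=None)

    # second node bootstraps off the first; max_concurrent_rpc caps parallel outgoing
    # RPCs, cache_size bounds the node-local LRU cache added by this commit
    bob = DHTNode(initial_peers=[('127.0.0.1', 33010)], max_concurrent_rpc=128,
                  cache_locally=True, cache_nearest=1, cache_size=10_000)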
@@ -47,9 +55,9 @@ class DHTNode:

     def __init__(self, node_id: Optional[DHTID] = None, port: Optional[Port] = None, initial_peers: List[Endpoint] = (),
                  bucket_size: int = 20, num_replicas: Optional[int] = None, depth_modulo: int = 5,
-                 wait_timeout: float = 5, staleness_timeout: Optional[float] = 600,
+                 max_concurrent_rpc: int = 128, wait_timeout: float = 5, staleness_timeout: Optional[float] = None,
                  bootstrap_timeout: Optional[float] = None, cache_locally: bool = True, cache_nearest: int = 1,
-                 interface: Hostname = '0.0.0.0'):
+                 cache_size=None, interface: Hostname = '0.0.0.0'):
         self.node_id = node_id = node_id if node_id is not None else DHTID.generate()
         self.port = port = port if port is not None else find_open_port()
         self.num_replicas = num_replicas if num_replicas is not None else bucket_size
@@ -58,34 +66,40 @@ class DHTNode:

         # create kademlia protocol and make it listen to a port
         loop = asyncio.get_event_loop()
-        make_protocol = partial(KademliaProtocol, self.node_id, bucket_size, depth_modulo, wait_timeout)
+        make_protocol = partial(KademliaProtocol, self.node_id, bucket_size, depth_modulo, wait_timeout,
+                                max_concurrent_rpc, num_replicas, cache_size)
         listener = loop.run_until_complete(loop.create_datagram_endpoint(make_protocol, local_addr=(interface, port)))
         self.transport: asyncio.Transport = listener[0]
         self.protocol: KademliaProtocol = listener[1]

         if initial_peers:
-            # bootstrap part 1: ping initial_peers, add each other to the routing table
+            # stage 1: ping initial_peers, add each other to the routing table
             bootstrap_timeout = bootstrap_timeout if bootstrap_timeout is not None else wait_timeout
-            began_bootstrap_time = get_dht_time()
+            start_time = get_dht_time()
             ping_tasks = map(self.protocol.call_ping, initial_peers)
-            finished_tasks, remaining_tasks = loop.run_until_complete(
-                asyncio.wait(ping_tasks, timeout=wait_timeout, return_when=asyncio.FIRST_COMPLETED))
-            time_to_first_response = get_dht_time() - began_bootstrap_time
-            # bootstrap part 2: gather all peers who responded within bootstrap_timeout, but at least one peer
-            if remaining_tasks:
+            finished_ping_tasks, remaining_ping_tasks = loop.run_until_complete(
+                asyncio.wait(ping_tasks, return_when=asyncio.FIRST_COMPLETED))
+
+            # stage 2: gather remaining peers (those who respond within bootstrap_timeout)
+            if remaining_ping_tasks:
                 finished_in_time, stragglers = loop.run_until_complete(
-                    asyncio.wait(remaining_tasks, timeout=bootstrap_timeout - time_to_first_response))
+                    asyncio.wait(remaining_ping_tasks, timeout=bootstrap_timeout - get_dht_time() + start_time))
                 for straggler in stragglers:
                     straggler.cancel()
-                finished_tasks |= finished_in_time
+                finished_ping_tasks |= finished_in_time

-            peer_ids = [task.result() for task in finished_tasks if task.result() is not None]
-            if len(peer_ids) == 0 and len(initial_peers) != 0:
+            if not finished_ping_tasks:
                 warn("DHTNode bootstrap failed: none of the initial_peers responded to a ping.")

-            # bootstrap part 3: run beam search for my node id to add my own nearest neighbors to the routing table
-            # ... and maybe receive some values that we are meant to store (see protocol.update_routing_table)
-            loop.run_until_complete(self.find_nearest_nodes(query_id=self.node_id))
+            # stage 3: traverse dht to find my own nearest neighbors and populate the routing table
+            # ... maybe receive some values that we are meant to store (see protocol.update_routing_table)
+            # note: using asyncio.wait instead of wait_for because wait_for cancels task on timeout
+            loop.run_until_complete(asyncio.wait([loop.create_task(self.find_nearest_nodes(query_id=self.node_id)),
+                                                  asyncio.sleep(bootstrap_timeout - get_dht_time() + start_time)],
+                                                 return_when=asyncio.FIRST_COMPLETED))
+
+        if self.staleness_timeout is not None:
+            loop.create_task(self._refresh_routing_table(period=self.staleness_timeout))

     async def find_nearest_nodes(self, query_id: DHTID, k_nearest: Optional[int] = None,
                                  beam_size: Optional[int] = None, exclude_self: bool = False) -> Dict[DHTID, Endpoint]:
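The note about asyncio.wait vs wait_for above is the crux of the non-blocking crawl: wait_for cancels its awaitable when the timeout fires, while asyncio.wait merely stops waiting and leaves the task running. A standalone sketch of stock asyncio behavior (not code from this commit):

    import asyncio

    async def traversal():
        await asyncio.sleep(10)  # stand-in for a long dht crawl

    async def main():
        task = asyncio.ensure_future(traversal())
        timeout = asyncio.ensure_future(asyncio.sleep(0.1))
        # returns after ~0.1s, but `task` keeps crawling in the background
        await asyncio.wait([task, timeout], return_when=asyncio.FIRST_COMPLETED)
        print(task.cancelled(), task.done())  # False False: not cancelled, still running

    asyncio.get_event_loop().run_until_complete(main())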
@@ -207,3 +221,16 @@ class DHTNode:
                    break

         return (latest_value, latest_expiration) if latest_expiration != -float('inf') else (None, None)
+
+    async def _refresh_routing_table(self, *, period: Optional[float]) -> None:
+        """ Tries to find new nodes for buckets that were unused for more than self.staleness_timeout """
+        while period is not None:  # period never changes here, so this loops forever unless period is None
+            refresh_time = get_dht_time()
+            staleness_threshold = refresh_time - self.staleness_timeout
+            stale_buckets = [bucket for bucket in self.protocol.routing_table.buckets
+                             if bucket.last_updated < staleness_threshold]
+            for bucket in stale_buckets:
+                refresh_id = DHTID(random.randint(bucket.lower, bucket.upper - 1))
+                await self.find_nearest_nodes(refresh_id)
+
+            await asyncio.sleep(max(0.0, period - (get_dht_time() - refresh_time)))
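For a stale bucket, the refresh id is drawn uniformly from that bucket's id range, so the lookup lands inside the bucket and repopulates it. Restated with plain integers (illustrative; real ids are DHTIDs):

    import random

    lower, upper = 0, 2 ** 16  # hypothetical bucket bounds
    refresh_id = random.randint(lower, upper - 1)  # mirrors DHTID(random.randint(bucket.lower, bucket.upper - 1))
    assert lower <= refresh_id < upper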

hivemind/dht/protocol.py (+32, -11)

@@ -1,6 +1,6 @@
 import asyncio
 import heapq
-from typing import Optional, List, Tuple, Dict
+from typing import Optional, List, Tuple, Dict, Iterator
 from rpcudp.protocol import RPCProtocol

 from .routing import RoutingTable, DHTID, DHTValue, DHTExpiration, BinaryDHTID, get_dht_time
@@ -21,10 +21,11 @@ class KademliaProtocol(RPCProtocol):
      Read more: https://github.com/bmuller/rpcudp/tree/master/rpcudp
     """

-    def __init__(self, node_id: DHTID, bucket_size: int, depth_modulo: int,
-                 wait_timeout: float, cache_size: Optional[int] = None):
+    def __init__(self, node_id: DHTID, bucket_size: int, depth_modulo: int, wait_timeout: float,
+                 max_concurrent_rpc: int, num_replicas: Optional[int] = None, cache_size: Optional[int] = None):
         super().__init__(wait_timeout)
-        self.node_id, self.bucket_size = node_id, bucket_size
+        self.node_id, self.bucket_size, self.num_replicas = node_id, bucket_size, num_replicas or bucket_size
+        self.rpc_semaphore = asyncio.BoundedSemaphore(value=max_concurrent_rpc)
         self.routing_table = RoutingTable(node_id, bucket_size, depth_modulo)
         self.storage = LocalStorage()
         self.cache = LocalStorage(maxsize=cache_size)
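The rpc_semaphore introduced here is the whole congestion fix: a BoundedSemaphore(value=N) admits at most N holders at once, so at most max_concurrent_rpc requests are in flight while the rest queue on `async with`. A standalone sketch of the pattern (not code from this commit):

    import asyncio

    sem = asyncio.BoundedSemaphore(value=2)  # cf. max_concurrent_rpc

    async def rpc(i):
        async with sem:               # take a slot; released automatically on exit
            await asyncio.sleep(0.1)  # stand-in for one UDP request/response
            return i

    async def main():
        # six calls, but only two run at a time: total ~0.3s rather than ~0.1s
        return await asyncio.gather(*[rpc(i) for i in range(6)])

    print(asyncio.get_event_loop().run_until_complete(main()))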
@@ -36,7 +37,8 @@ class KademliaProtocol(RPCProtocol):

     async def call_ping(self, recipient: Endpoint) -> Optional[DHTID]:
         """ Get recipient's node id and add him to the routing table. If recipient doesn't respond, return None """
-        responded, response = await self.ping(recipient, bytes(self.node_id))
+        async with self.rpc_semaphore:
+            responded, response = await self.ping(recipient, bytes(self.node_id))
         recipient_node_id = DHTID.from_bytes(response) if responded else None
         asyncio.ensure_future(self.update_routing_table(recipient_node_id, recipient, responded=responded))
         return recipient_node_id
@@ -58,8 +60,9 @@ class KademliaProtocol(RPCProtocol):

         :returns: True if value was accepted, False if it was rejected (recipient has newer value), None if no response
         """
-        responded, response = await self.store(recipient, bytes(self.node_id), bytes(key),
-                                               value, expiration_time, in_cache)
+        async with self.rpc_semaphore:
+            responded, response = await self.store(recipient, bytes(self.node_id), bytes(key),
+                                                   value, expiration_time, in_cache)
         if responded:
             store_accepted, recipient_node_id = response[0], DHTID.from_bytes(response[1])
             asyncio.ensure_future(self.update_routing_table(recipient_node_id, recipient, responded=responded))
@@ -86,7 +89,8 @@ class KademliaProtocol(RPCProtocol):

         :returns: a dictionary[node id => address] as per Section 2.3 of the paper
         """
-        responded, response = await self.find_node(recipient, bytes(self.node_id), bytes(query_id))
+        async with self.rpc_semaphore:
+            responded, response = await self.find_node(recipient, bytes(self.node_id), bytes(query_id))
         if responded:
             peers = {DHTID.from_bytes(peer_id_bytes): tuple(addr) for peer_id_bytes, addr in response[0]}
             # Note: we convert addr from list to tuple here --^ because some msgpack versions convert tuples to lists
@@ -122,7 +126,8 @@ class KademliaProtocol(RPCProtocol):
          neighbors:  a dictionary[node id => address] as per Section 2.3 of the paper;
         :note: if no response, returns None, None, {}
         """
-        responded, response = await self.find_value(recipient, bytes(self.node_id), bytes(key))
+        async with self.rpc_semaphore:
+            responded, response = await self.find_value(recipient, bytes(self.node_id), bytes(key))
         if responded:
             (value, expiration_time, peers_bytes), recipient_id = response[:-1], DHTID.from_bytes(response[-1])
             peers = {DHTID.from_bytes(peer_id_bytes): tuple(addr) for peer_id_bytes, addr in peers_bytes}
@@ -140,11 +145,23 @@ class KademliaProtocol(RPCProtocol):
           For incoming requests, this should always be True
         """
         if responded:  # incoming request or outgoing request with response
+            if node_id not in self.routing_table:
+                # we just met a new node, maybe we know some values that it *should* store
+                for key, value, expiration in list(self.storage.items()):
+                    neighbors = self.routing_table.get_nearest_neighbors(key, self.num_replicas, exclude=self.node_id)
+                    if neighbors:
+                        nearest_distance = neighbors[0][0].xor_distance(key)
+                        farthest_distance = neighbors[-1][0].xor_distance(key)
+                        new_node_should_store = node_id.xor_distance(key) < farthest_distance
+                        this_node_is_responsible = self.node_id.xor_distance(key) < nearest_distance
+                    if not neighbors or (new_node_should_store and this_node_is_responsible):
+                        asyncio.create_task(self.call_store(addr, key, value, expiration))
+
             maybe_node_to_ping = self.routing_table.add_or_update_node(node_id, addr)
             if maybe_node_to_ping is not None:
                 # we couldn't add new node because the table was full. Check if existing peers are alive (Section 2.2)
                 # ping one least-recently updated peer: if it won't respond, remove it from the table, else update it
-                await self.call_ping(maybe_node_to_ping[1])  # [1]-th element is that node's endpoint
+                asyncio.create_task(self.call_ping(maybe_node_to_ping[1]))  # [1]-th element is that node's endpoint

         else:  # outgoing request and peer did not respond
             if node_id is not None and node_id in self.routing_table:
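The welcome logic above sends a stored key to a newcomer only if the newcomer would rank among the num_replicas nearest holders (closer than the current farthest) and this node is itself the closest known holder, which is what stops peers from re-sending everything to everyone. The same rule restated with plain integers standing in for DHTIDs (illustrative; the real code uses DHTID.xor_distance):

    def should_send(key, self_id, new_id, neighbor_ids, num_replicas):
        # neighbors sorted by xor distance to the key, as get_nearest_neighbors would return them
        neighbors = sorted(neighbor_ids, key=lambda nid: nid ^ key)[:num_replicas]
        if not neighbors:
            return True  # we know no one else: the newcomer gets everything we store
        nearest, farthest = neighbors[0] ^ key, neighbors[-1] ^ key
        new_node_should_store = (new_id ^ key) < farthest
        this_node_is_responsible = (self_id ^ key) < nearest
        return new_node_should_store and this_node_is_responsible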
@@ -160,7 +177,6 @@ class KademliaProtocol(RPCProtocol):
             super()._accept_response(msg_id, data, address)


-
 class LocalStorage:
     def __init__(self, maxsize: Optional[int] = None):
         self.cache_size = maxsize or float("inf")
@@ -200,3 +216,8 @@ class LocalStorage:
         if key in self.data:
             return self.data[key]
         return None, None
+
+    def items(self) -> Iterator[Tuple[DHTID, DHTValue, DHTExpiration]]:
+        """ Iterate over (key, value, expiration_time) tuples stored in this storage """
+        self.remove_outdated()
+        return ((key, value, expiration) for key, (value, expiration) in self.data.items())
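A sketch of the new items() iterator (assumes a store(key, value, expiration_time) method matching call_store above, which is not shown in this diff; outdated entries are purged before iteration):

    from hivemind.dht.protocol import LocalStorage
    from hivemind.dht.routing import DHTID, get_dht_time

    storage = LocalStorage(maxsize=1000)
    storage.store(DHTID.generate(), b'some value', get_dht_time() + 60)
    for key, value, expiration in storage.items():
        print(key, value, expiration)  # skips anything already expired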

hivemind/dht/routing.py (+3, -0)

@@ -193,6 +193,9 @@ class KBucket:
                 newnode_id, newnode = self.replacement_nodes.popitem()
                 self.nodes_to_addr[newnode_id] = newnode

+    def __contains__(self, node_id: DHTID):
+        return node_id in self.nodes_to_addr or node_id in self.replacement_nodes
+
     def __len__(self):
         return len(self.nodes_to_addr)
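With KBucket.__contains__ in place, the `node_id not in self.routing_table` check in protocol.py also covers replacement nodes, not just active ones. A sketch using only signatures visible in this diff (that RoutingTable.__contains__ delegates to its buckets is an assumption):

    from hivemind.dht.routing import RoutingTable, DHTID

    table = RoutingTable(DHTID.generate(), bucket_size=20, depth_modulo=5)
    some_id = DHTID.generate()
    print(some_id in table)  # False until add_or_update_node(some_id, addr) is called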
 
 

hivemind/runtime/task_pool.py (+1, -1)

@@ -55,7 +55,7 @@ class TaskPool(TaskPoolBase):
     to process these batches and dispatches results back to request sources. Operates as a background process.

     :param process_func: function to be applied to every formed batch; called by Runtime
-        Note that process_func should accept only \*args Tensors and return a flat tuple of Tensors
+        Note that process_func should accept only positional args (Tensors) and return a flat tuple of Tensors
     :param max_batch_size: process at most this many inputs in a batch (a task may contain one or several inputs)
     :param min_batch_size: process at least this many inputs in a batch, otherwise wait for more
     :param timeout: wait for a subsequent task for at most this many seconds
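A process_func of the documented shape (sketch; torch is assumed here since Runtime batches Tensors):

    import torch

    def process_func(*batch: torch.Tensor):
        # positional Tensor args in, flat tuple of Tensors out
        return tuple(tensor * 2 for tensor in batch)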

tests/test_dht.py (+3, -2)

@@ -20,7 +20,7 @@ from hivemind.dht.protocol import LocalStorage
 def run_protocol_listener(port: int, dhtid: DHTID, started: mp.synchronize.Event,
                           ping: Optional[hivemind.Endpoint] = None):
     loop = asyncio.new_event_loop()
-    protocol = partial(KademliaProtocol, dhtid, bucket_size=20, depth_modulo=5, wait_timeout=5)
+    protocol = partial(KademliaProtocol, dhtid, bucket_size=20, depth_modulo=5, wait_timeout=5, max_concurrent_rpc=128)
     listen = loop.create_datagram_endpoint(protocol, local_addr=('127.0.0.1', port))
     transport, protocol = loop.run_until_complete(listen)
     print(f"Started peer id={protocol.node_id} port={port}", flush=True)
@@ -47,7 +47,8 @@ def test_kademlia_protocol():

         port = hivemind.find_open_port()
         loop = asyncio.new_event_loop()
-        protocol = partial(KademliaProtocol, DHTID.generate(), bucket_size=20, depth_modulo=5, wait_timeout=5)
+        protocol = partial(KademliaProtocol, DHTID.generate(), bucket_size=20, depth_modulo=5, wait_timeout=5,
+                           max_concurrent_rpc=128)
         listen = loop.create_datagram_endpoint(protocol, local_addr=('127.0.0.1', port))
         transport, protocol = loop.run_until_complete(listen)
         print(f"Self id={protocol.node_id} port={port}", flush=True)