5
0
Эх сурвалжийг харах

Hotfix: Increase daemon_startup_timeout (#292)

For some reason, 15 sec is currently not enough to connect to the bootstrap peers in the public swarm, as reported by multiple users and observed by me. This increases the timeout to 120 sec until we find the root cause of the issue.
Alexander Borzunov 2 жил өмнө
parent
commit
e0cef73757

+ 5 - 0
src/petals/cli/run_server.py

@@ -47,6 +47,9 @@ def main():
     parser.add_argument('--announce_maddrs', nargs='+', required=False,
                         help='Visible multiaddrs the host announces for external connections from other peers')
 
+    parser.add_argument('--daemon_startup_timeout', type=float, default=120,
+                        help='Timeout for the libp2p daemon connecting to initial peers')
+
     parser.add_argument('--compression', type=str, default='NONE', required=False, help='Tensor compression communication')
 
     parser.add_argument('--num_handlers', type=int, default=8, required=False,
@@ -167,6 +170,8 @@ def main():
         assert port != 0, "Please specify a fixed non-zero --port when you use --public_ip (e.g., --port 31337)"
         announce_maddrs = [f"/ip4/{public_ip}/tcp/{port}"]
 
+    args["startup_timeout"] = args.pop("daemon_startup_timeout")
+
     if args.pop("increase_file_limit"):
         increase_file_limit()
 

+ 1 - 1
src/petals/client/remote_model.py

@@ -32,7 +32,7 @@ class DistributedBloomConfig(BloomConfig):
 
     initial_peers: List[str] = PUBLIC_INITIAL_PEERS  # a list of initial peers for hivemind DHT
     dht_prefix: str  # a prefix for all dht keys that correspond to this model (usually equal to model name)
-    daemon_startup_timeout: int = 30
+    daemon_startup_timeout: int = 120  # timeout for the libp2p daemon connecting to initial peers
     dht: Optional[hivemind.DHT] = None  # a running DHT instance, e.g. when using the same DHT for multiple models
     request_timeout: int = 3 * 60  # a number of seconds for waiting result from each node
     max_retries: Optional[int] = None  # max number retries before the client raises an exception (default: inf)