2 年之前 · 97dd3c874a
--- a/cli/run_server.py
+++ b/cli/run_server.py
@@ -31,7 +31,7 @@ def main():
 
															     parser.add_argument('--num_handlers', type=int, default=8, required=False,
														
 
															                         help='server will use this many processes to handle incoming requests')
														
 
															     parser.add_argument('--min_batch_size', type=int, default=1,
														
 
															-                        help='Minimum required batch size for all expert operations')
														
 
															+                        help='Minimum required batch size for all operations (in total tokens)')
														
 
															     parser.add_argument('--max_batch_size', type=int, default=16384,
														
 
															                         help='The total number of tokens in the same batch will not exceed this value')
														
 
															     parser.add_argument('--prefetch_batches', type=int, default=1, required=False,
														
@@ -43,7 +43,7 @@ def main():
 
															     parser.add_argument('--cache_dir', type=str, default=None, 
														
 
															                         help='Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.')
														
 
															     parser.add_argument('--device', type=str, default=None, required=False,
														
 
															-                        help='all experts will use this device in torch notation; default: cuda if available else cpu')
														
 
															+                        help='all blocks will use this device in torch notation; default: cuda if available else cpu')
														
 
															     parser.add_argument("--torch_dtype", type=str, default="auto",
														
 
															                         help="Use this dtype to store block weights and do computations. "
														
 
															                              "By default, respect the dtypes in the pre-trained state dict.")
														
@@ -62,7 +62,7 @@ def main():
 
															                              'on the first run and uses these estimates for future runs. '
														
 
															                              'If set to "eval", the script re-evaluates the throughput and overrides the cache.')
														
 
															     parser.add_argument('--update_period', type=float, required=False, default=30,
														
 
															-                        help='Server will report experts to DHT once in this many seconds')
														
 
															+                        help='Server will report blocks to DHT once in this many seconds')
														
 
															     parser.add_argument('--expiration', type=float, required=False, default=None,
														
 
															                         help='DHT entries will expire after this many seconds')
														
 
															     parser.add_argument('--initial_peers', type=str, nargs='*', required=False, default=[],
														
--- a/src/server/backend.py
+++ b/src/server/backend.py
@@ -80,5 +80,5 @@ class TransformerBackend(ModuleBackend):
 
															         return self.forward_pool, self.backward_pool, self.inference_pool
														
 
															     def get_info(self) -> Dict[str, Any]:
														
 
															-        """Get expert parameters and stats. Used by RemoteExpert to check shapes and for DMoE orchestration."""
														
 
															+        """Get module parameters and stats. Used by RemoteExpert to check shapes and for DMoE orchestration."""
														
 
															         return dict(super().get_info(), inference_schema=self.inference_schema)
														
--- a/src/server/server.py
+++ b/src/server/server.py
@@ -71,9 +71,9 @@ class Server(threading.Thread):
 
															         runs Runtime (self.runtime) to process incoming requests.
														
 
															         """
														
 
															         logger.info(f"Serving {len(self.module_backends)} blocks:")
														
 
															-        for expert_name, backend in self.module_backends.items():
														
 
															+        for block_name, backend in self.module_backends.items():
														
 
															             num_parameters = sum(p.numel() for p in backend.module.parameters() if p.requires_grad)
														
 
															-            logger.info(f"{expert_name}: {backend.module.__class__.__name__}, {num_parameters} parameters")
														
 
															+            logger.info(f"{block_name}: {backend.module.__class__.__name__}, {num_parameters} parameters")
														
 
															         if not self.dht.is_alive():
														
 
															             self.dht.run_in_background(await_ready=True)