há 4 anos · 339f35f25d
--- a/hivemind/moe/client/balanced_expert.py
+++ b/hivemind/moe/client/balanced_expert.py
@@ -125,8 +125,8 @@ class _BalancedRemoteModuleCall(torch.autograd.Function):
 
				                 break
			
 
				             except KeyboardInterrupt:
			
 
				                 raise
			
 
				-            except BaseException as e:
			
 
				-                logger.exception(f"Tried to call forward for expert {chosen_expert} but caught {repr(e)}")
			
 
				+            except BaseException:
			
 
				+                logger.exception(f"Tried to call forward for expert {chosen_expert}:")
			
 
				 
			
 
				         deserialized_outputs = [deserialize_torch_tensor(tensor) for tensor in outputs.tensors]
			
 
				         return tuple(deserialized_outputs)
			
@@ -149,7 +149,7 @@ class _BalancedRemoteModuleCall(torch.autograd.Function):
 
				                 break
			
 
				             except KeyboardInterrupt:
			
 
				                 raise
			
 
				-            except BaseException as e:
			
 
				-                logger.exception(f"Tried to call backward for expert {chosen_expert} but caught {repr(e)}")
			
 
				+            except BaseException:
			
 
				+                logger.exception(f"Tried to call backward for expert {chosen_expert}:")
			
 
				         deserialized_grad_inputs = [deserialize_torch_tensor(tensor) for tensor in grad_inputs.tensors]
			
 
				         return (DUMMY, None, None, None, None, None, None, *deserialized_grad_inputs)
			
--- a/hivemind/moe/client/balancer.py
+++ b/hivemind/moe/client/balancer.py
@@ -107,6 +107,7 @@ class ExpertBalancer:
 
				                 continue
			
 
				 
			
 
				             with self.lock:
			
 
				+                logger.debug(f"Getting a new expert, queue state: {self.queue}")
			
 
				                 current_runtime, _, uid = heap_entry = heapq.heappop(self.queue)
			
 
				                 maybe_endpoint = self.experts.get(uid)
			
 
				                 if maybe_endpoint is None: