Explorar el Código

Extra logging

Max Ryabinin hace 3 años
commit padre 339f35f25d
Se han modificado 2 ficheros con 5 adiciones y 4 borrados
  1. +4 -4
      hivemind/moe/client/balanced_expert.py
  2. +1 -0
      hivemind/moe/client/balancer.py

+ 4 - 4
hivemind/moe/client/balanced_expert.py

@@ -125,8 +125,8 @@ class _BalancedRemoteModuleCall(torch.autograd.Function):
                 break
             except KeyboardInterrupt:
                 raise
-            except BaseException as e:
-                logger.exception(f"Tried to call forward for expert {chosen_expert} but caught {repr(e)}")
+            except BaseException:
+                logger.exception(f"Tried to call forward for expert {chosen_expert}:")
 
         deserialized_outputs = [deserialize_torch_tensor(tensor) for tensor in outputs.tensors]
         return tuple(deserialized_outputs)
@@ -149,7 +149,7 @@ class _BalancedRemoteModuleCall(torch.autograd.Function):
                 break
             except KeyboardInterrupt:
                 raise
-            except BaseException as e:
-                logger.exception(f"Tried to call backward for expert {chosen_expert} but caught {repr(e)}")
+            except BaseException:
+                logger.exception(f"Tried to call backward for expert {chosen_expert}:")
         deserialized_grad_inputs = [deserialize_torch_tensor(tensor) for tensor in grad_inputs.tensors]
         return (DUMMY, None, None, None, None, None, None, *deserialized_grad_inputs)

+ 1 - 0
hivemind/moe/client/balancer.py

@@ -107,6 +107,7 @@ class ExpertBalancer:
                 continue
 
             with self.lock:
+                logger.debug(f"Getting a new expert, queue state: {self.queue}")
                 current_runtime, _, uid = heap_entry = heapq.heappop(self.queue)
                 maybe_endpoint = self.experts.get(uid)
                 if maybe_endpoint is None: