Forráskód Böngészése

Make the first retry delay be zero

Aleksandr Borzunov 2 éve
szülő
commit
b278a8d5f1

+ 1 - 1
src/client/inference_session.py

@@ -278,7 +278,7 @@ class InferenceSession:
                     block_idx = span.end
                     break
                 except Exception as e:
-                    delay = self._sequence_manager.min_backoff * 2**attempt_no
+                    delay = self._sequence_manager.get_retry_delay(attempt_no)
                     logger.warning(
                         f"Caught exception when running inference from block {block_idx} "
                         f"(retry in {delay:.0f} sec): {repr(e)}"

+ 5 - 0
src/client/sequence_manager.py

@@ -160,3 +160,8 @@ class RemoteSequenceManager:
                     else:
                         logger.warning(f"Tried to call rpc_info, but caught {repr(e)}", exc_info=True)
         return self._rpc_info
+
+    def get_retry_delay(self, attempt_no: int) -> float:
+        if attempt_no == 0:
+            return 0
+        return self.min_backoff * 2 ** (attempt_no - 1)

+ 2 - 2
src/client/sequential_autograd.py

@@ -81,7 +81,7 @@ async def sequential_forward(
                 block_idx = span.end
                 break
             except Exception as e:
-                delay = sequence_manager.min_backoff * 2**attempt_no
+                delay = sequence_manager.get_retry_delay(attempt_no)
                 logger.warning(
                     f"Caught exception when running forward from block {block_idx} "
                     f"(retry in {delay:.0f} sec): {repr(e)}"
@@ -141,7 +141,7 @@ async def sequential_backward(
                 grad_prompts_reversed.extend(span_grad_prompts)
                 break
             except Exception as e:
-                delay = sequence_manager.min_backoff * 2**attempt_no
+                delay = sequence_manager.get_retry_delay(attempt_no)
                 logger.warning(
                     f"Caught exception when running backward between blocks {span.start}-{span.end} "
                     f"(retry in {delay:.0f} sec): {repr(e)}"