瀏覽代碼

Inherit bitsandbytes compute dtype correctly (override peft quirk) (#377)

justheuristic 2 年之前
父節點
當前提交
398a384075
共有 2 個文件被更改，包括 2 次插入和 2 次刪除
  1. src/petals/client/routing/sequence_manager.py (+1 −2)
  2. src/petals/utils/peft.py (+1 −0)

+ 1 - 2
src/petals/client/routing/sequence_manager.py

@@ -212,7 +212,6 @@ class RemoteSequenceManager:
         end_index: int,
         *,
         cache_tokens_needed: Optional[int],
-        overhead_coeff: float = 1.82,  # Backend overhead (empirically measured)
         overhead_delay: float = 0.018,  # Serialization overhead (empirically measured)
         default_inference_rps: float = 300,  # If inference RPS unknown
         alloc_delay: float = 10,  # If not enough cache left, we penalize the edge
@@ -266,7 +265,7 @@ class RemoteSequenceManager:
                 inference_rps = span.server_info.inference_rps
                 if inference_rps is None:
                     inference_rps = default_inference_rps
-                graph.add_edge((span.peer_id, block_idx), (span.peer_id, block_idx + 1), overhead_coeff / inference_rps)
+                graph.add_edge((span.peer_id, block_idx), (span.peer_id, block_idx + 1), 1.0 / inference_rps)
 
         return graph
 

+ 1 - 0
src/petals/utils/peft.py

@@ -198,6 +198,7 @@ def create_lora_adapter(block, quant_type: QuantType):
                     child.out_features,
                     **kwargs,
                 )
+                lora_wrapped_child.compute_dtype = child.compute_dtype
             else:
                 bias = hasattr(child, "bias") and child.bias is not None
                 lora_wrapped_child = LoraLinear(