版本: v1.0.0
日期: 2025-10-09
负责人: Grid Strategy Team
依赖文档: IMPLEMENTATION_PLAN.md, GRID_STRATEGY_DESIGN.md, CONFIG_REFERENCE.md
从实盘日志分析发现以下核心问题:
问题 1: 频繁全撤全布导致成交缺失
2025-10-08 日志分析:
- 运行时长: 20 分钟
- 网格调整次数: 8 次
- 成交订单数: 0 (filledGrids: 0)
- 问题: 每次自适应调整都触发 cancelAll() → 全部档位下线 → 错失成交机会
问题 2: 批量下单触发限流
初始下单: 20 订单,耗时 1.1s (正常)
第 2 次调整: 20 订单,耗时 58s (异常)
第 3 次调整: 20 订单,耗时 119s (严重异常)
根因: 交易所限流,导致每个订单平均延迟 5.9s
问题 3: 网格步长放大导致档位数减少
初始: grid_step=15bps → 13 层 × 2 = 26 订单
调整 1: grid_step=26bps → 7 层 × 2 = 14 订单 (-46%)
调整 2: grid_step=41bps → 4 层 × 2 = 8 订单 (-69%)
结果: 市场覆盖不足,成交概率大幅下降
引入增量网格维护机制,替代"全撤全布"模式:
预期效果:
/**
 * Inputs of one grid reconciliation pass.
 *
 * NOTE(review): the `reconcileGrid` method in this document only takes
 * `targetPrices` and reads everything else from instance state — confirm
 * whether this shape is still meant to be passed around.
 */
interface ReconcileInput {
  /** State of the orders that currently exist. */
  currentGrids: Map<number, GridLevel>;
  /** List of target price levels. */
  targetPrices: number[];
  /** Current grid step, in basis points. */
  gridStepBps: number;
  /** Minimum number of layers to keep. */
  minLayers: number;
}
/** Plan produced by a reconciliation pass: what to do with each existing order and which prices need new ones. */
interface ReconcileOutput {
  /** Orders that must be cancelled. */
  toCancel: GridLevel[];
  /** Orders whose price must be amended. */
  toModify: GridLevel[];
  /** Price levels that need a new order. */
  toAdd: number[];
  /** Orders left untouched. */
  keepAlive: GridLevel[];
}
/**
 * Diffs the resting grid orders against `targetPrices` and builds an
 * incremental update plan.
 *
 * Every resting order is paired with its nearest target and classified by its
 * distance to that target, measured in bps of the order's own price:
 *   - below 30% of the grid step         -> `keepAlive`
 *   - below 60% of the grid step         -> `toModify` (its `targetPx` is set)
 *   - otherwise, or no nearest target    -> `toCancel`
 *
 * Orders are considered closest-first and each target can be claimed by one
 * order only; an order whose nearest target is already claimed is cancelled
 * instead of producing a second order at the same price. Any target that is
 * neither claimed nor within one tick of a kept/modified order is emitted in
 * `toAdd` exactly once.
 *
 * `toModify` holds the live level objects from `this.grids` (not copies), so
 * that price/flag updates made while executing the plan reach the grid state.
 *
 * @param targetPrices Desired grid price levels.
 * @returns The classification of existing orders plus the prices to add.
 */
async reconcileGrid(targetPrices: number[]): Promise<ReconcileOutput> {
  const threshold = this.config.gridStepBps * 0.3; // deviation threshold: 30% of the step
  const tickSize = this.tickSize;
  const toCancel: GridLevel[] = [];
  const toModify: GridLevel[] = [];
  const toAdd: number[] = [];
  const keepAlive: GridLevel[] = [];

  // 1. Pair every existing order with its nearest target.
  const candidates: Array<{ grid: GridLevel; target: number; deviationBps: number }> = [];
  for (const grid of this.grids.values()) {
    const nearestTarget = this.findNearest(grid.px, targetPrices);
    if (!nearestTarget) {
      // No matching target -> cancel.
      toCancel.push(grid);
      continue;
    }
    const deviation = Math.abs(nearestTarget - grid.px);
    candidates.push({
      grid,
      target: nearestTarget,
      deviationBps: (deviation / grid.px) * 10000
    });
  }

  // 2. Classify closest-first so the best-placed order wins a contested target.
  candidates.sort((a, b) => a.deviationBps - b.deviationBps);
  const claimed = new Set<number>();
  for (const { grid, target, deviationBps } of candidates) {
    const targetFree = !claimed.has(target);
    if (targetFree && deviationBps < threshold) {
      // Small deviation -> keep as is.
      keepAlive.push(grid);
      claimed.add(target);
    } else if (targetFree && deviationBps < threshold * 2) {
      // Moderate deviation -> amend the price.
      grid.targetPx = target;
      toModify.push(grid);
      claimed.add(target);
    } else {
      // Too far away (or target already served) -> cancel. The target itself,
      // if still uncovered, is picked up by step 3; pushing it here as well
      // would place the same price twice.
      toCancel.push(grid);
    }
  }

  // 3. Every target nobody serves needs a new order.
  for (const target of targetPrices) {
    if (claimed.has(target)) {
      continue;
    }
    const hasExisting = keepAlive.some(g => Math.abs(g.px - target) < tickSize);
    const willModify = toModify.some(
      g => g.targetPx !== undefined && Math.abs(g.targetPx - target) < tickSize
    );
    if (!hasExisting && !willModify) {
      toAdd.push(target);
      claimed.add(target); // also de-duplicates repeated entries in targetPrices
    }
  }

  return { toCancel, toModify, toAdd, keepAlive };
}
/**
 * Executes a reconciliation plan: amends first, then cancels in batches, then
 * places new orders in batches, and finally checks how many grid levels ended
 * up online.
 *
 * The caller's `plan` is not mutated; levels whose amend/replace failed are
 * queued on local copies of the cancel/add lists.
 *
 * If fewer than 80% of the expected levels (kept + successfully modified +
 * added) are present afterwards, the failure counter is incremented and, on
 * the third consecutive failure, `fallbackToFullReset()` is invoked. A healthy
 * run resets the counter.
 *
 * @param plan Output of `reconcileGrid`.
 */
async executeReconcile(plan: ReconcileOutput): Promise<void> {
  const batchSize = this.config.incrementalMode.batchSize;
  const batchDelay = this.config.incrementalMode.batchIntervalMs;
  const maxConsecutiveFailures = 3;

  this.logger.info({
    toCancel: plan.toCancel.length,
    toModify: plan.toModify.length,
    toAdd: plan.toAdd.length,
    keepAlive: plan.keepAlive.length
  }, 'Executing incremental grid update');

  const toCancel = [...plan.toCancel];
  const toAdd = [...plan.toAdd];
  let modifiedOk = 0;

  // 1. Amend first, so liquidity stays on the book.
  for (const level of plan.toModify) {
    const targetPx = level.targetPx;
    if (targetPx === undefined) {
      this.logger.warn({ orderId: level.orderId }, 'Level queued for modify has no targetPx, skipping');
      continue;
    }
    try {
      await this.modifyOrReplace(level, targetPx);
      modifiedOk++;
    } catch (error) {
      // Amend and replace both failed -> cancel and re-add in the batches below.
      this.logger.warn({ orderId: level.orderId, targetPx, error }, 'Modify/replace failed, queueing cancel and re-add');
      toCancel.push(level);
      toAdd.push(targetPx);
    }
  }

  // 2. Cancel stale orders in batches.
  await this.batchCancel(toCancel, batchSize, batchDelay);

  // 3. Place new orders in batches.
  await this.batchPlace(toAdd, batchSize, batchDelay);

  // 4. Verify the final state. Successfully modified orders are still in the
  //    grid, so they must count towards the expectation as well.
  const finalGrids = this.grids.size;
  const expectedGrids = plan.keepAlive.length + modifiedOk + toAdd.length;
  if (finalGrids < expectedGrids * 0.8) {
    this.logger.error({
      finalGrids,
      expectedGrids,
      successRate: (finalGrids / expectedGrids * 100).toFixed(1) + '%'
    }, 'Incremental update success rate too low, triggering fallback');
    this.incrementalFailures++;
    if (this.incrementalFailures >= maxConsecutiveFailures) {
      await this.fallbackToFullReset();
    }
  } else {
    this.incrementalFailures = 0;
  }
}
/**
 * Amends the price of a resting order through the router's native modify call
 * and mirrors the result into the local level.
 *
 * The local `px` is set to the normalized price that was actually sent, so the
 * local state cannot drift from the price resting on the exchange.
 *
 * @param level Grid level whose order is amended; updated in place on success.
 * @param newPx Desired price before normalization.
 * @throws Error if the router does not support modify, if the level has no
 *         `orderId`, or if the router call rejects.
 */
async modifyOrder(level: GridLevel, newPx: number): Promise<void> {
  if (!this.router.supportsModify) {
    throw new Error('Modify not supported');
  }
  if (!level.orderId) {
    throw new Error('Cannot modify a grid level without an orderId');
  }

  const normalizedPx = this.normalizePrice(level.side, newPx);
  const modifyRequest = {
    orderId: level.orderId,
    newPx: normalizedPx,
    newSz: level.sz
  };
  await this.router.modifyLimit(modifyRequest);

  // Update local state only after the router call succeeded.
  level.px = normalizedPx;
  level.timestamp = Date.now();
  level.dirty = false;
}
/**
 * Moves an order to `newPx`, preferring a native modify and degrading to
 * cancel + re-place when the modify fails for any reason.
 *
 * While the fallback is running the level is flagged `dirty`; the flag is
 * always cleared again, including when the cancel or the re-place throws, so a
 * failed replace cannot leave the level marked as "being modified" forever.
 *
 * @param level Grid level to move.
 * @param newPx Target price.
 * @throws Whatever the cancel or re-place step throws; also an Error when the
 *         level has no `orderId` to cancel.
 */
async modifyOrReplace(level: GridLevel, newPx: number): Promise<void> {
  try {
    // Try the native modify first.
    await this.modifyOrder(level, newPx);
    this.logger.debug({ orderId: level.orderId, newPx }, 'Order modified successfully');
    return;
  } catch (error) {
    // Degrade to cancel + re-place.
    this.logger.warn({ error }, 'Modify failed, falling back to cancel+place');
  }

  // Mark as dirty to prevent duplicate operations on this level.
  level.dirty = true;
  try {
    if (level.orderId === undefined) {
      throw new Error('Cannot replace a grid level without an orderId');
    }
    // Cancel and re-place back to back (sequentially awaited).
    await this.router.cancel(level.orderId);
    this.grids.delete(level.index);
    const newOrderId = await this.placeGridOrder(
      level.index,
      level.side,
      newPx,
      level.sz
    );
    this.logger.info({
      oldOrderId: level.orderId,
      newOrderId,
      newPx
    }, 'Order replaced successfully');
  } finally {
    level.dirty = false;
  }
}
/**
 * Cancels the given levels in batches to stay under the exchange rate limit.
 *
 * Cancels inside one batch run in parallel; `delayMs` is awaited between
 * batches (not after the last one). A failed cancel does not abort the run,
 * but it is logged instead of being silently dropped.
 *
 * @param levels    Grid levels to cancel.
 * @param batchSize Maximum number of cancels issued in parallel.
 * @param delayMs   Pause between two batches, in milliseconds.
 */
async batchCancel(
  levels: GridLevel[],
  batchSize: number,
  delayMs: number
): Promise<void> {
  const batches = chunk(levels, batchSize);
  for (const [index, batch] of batches.entries()) {
    this.logger.debug({
      batchIndex: index + 1,
      batchSize: batch.length,
      totalBatches: batches.length
    }, 'Executing cancel batch');

    // Cancel all orders of the same batch in parallel.
    const results = await Promise.allSettled(
      batch.map(level => this.cancelGridLevel(level))
    );
    results.forEach((result, position) => {
      if (result.status === 'rejected') {
        this.logger.warn({
          orderId: batch[position]?.orderId,
          error: result.reason
        }, 'Failed to cancel grid order in batch');
      }
    });

    // Pause between batches to avoid rate limiting.
    if (index < batches.length - 1) {
      await sleep(delayMs);
    }
  }
}
/**
 * Places one grid order per price, in batches.
 *
 * Prices above `this.gridCenter` become sells, all others buys. Orders of one
 * batch are submitted in parallel; a failed placement is logged and does not
 * stop the remaining ones. `delayMs` is awaited between batches only.
 *
 * @param prices    Price levels to place orders at.
 * @param batchSize Maximum number of placements issued in parallel.
 * @param delayMs   Pause between two batches, in milliseconds.
 */
async batchPlace(
  prices: number[],
  batchSize: number,
  delayMs: number
): Promise<void> {
  // Submits a single order; placement errors are logged, never rethrown.
  const placeOne = async (px: number): Promise<void> => {
    const side = px > this.gridCenter ? 'sell' : 'buy';
    const gridIndex = this.calculateGridIndex(px);
    try {
      await this.placeGridOrder(gridIndex, side, px, this.baseClipSz);
    } catch (error) {
      this.logger.warn({ px, error }, 'Failed to place grid order in batch');
    }
  };

  const groups = chunk(prices, batchSize);
  for (const [groupNo, group] of groups.entries()) {
    this.logger.debug({
      batchIndex: groupNo + 1,
      batchSize: group.length,
      totalBatches: groups.length
    }, 'Executing place batch');

    await Promise.allSettled(group.map(placeOne));

    const isLastGroup = groupNo >= groups.length - 1;
    if (!isLastGroup) {
      await sleep(delayMs);
    }
  }
}
/** One level of the grid together with the order resting on it. */
interface GridLevel {
  index: number;
  side: Side;
  px: number;
  sz: number;
  orderId?: string;
  clientId?: string;
  filled: boolean;
  timestamp: number;

  // --- fields added for incremental maintenance ---

  /** A modification of this level is in progress. */
  dirty?: boolean;
  /** Waiting for confirmation. */
  pending?: boolean;
  /** Number of retries. */
  retryCount?: number;
  /** Target price (used when modifying the order). */
  targetPx?: number;
}
/**
 * Excerpt of GridMaker showing how reconcile runs are serialized.
 *
 * Only one reconcile runs at a time. A request arriving while one is running
 * is remembered in `pendingReconcile` and replayed once after the running one
 * finishes.
 */
class GridMaker {
  private reconcileLock = false;
  private pendingReconcile = false;

  /**
   * Runs a reconcile unless one is already in progress, in which case the
   * request is deferred until the current run ends.
   */
  async maybeReconcile(): Promise<void> {
    if (this.reconcileLock) {
      this.pendingReconcile = true;
      this.logger.debug('Reconcile in progress, deferring');
      return;
    }

    this.reconcileLock = true;
    this.pendingReconcile = false;
    try {
      await this.reconcileGrid(/* ... */);
    } finally {
      this.reconcileLock = false;

      // Replay a request that arrived while the lock was held. The follow-up
      // run is detached from this call, so its rejection must be handled here;
      // otherwise it would surface as an unhandled promise rejection.
      if (this.pendingReconcile) {
        setImmediate(() => {
          this.maybeReconcile().catch((error: unknown) => {
            this.logger.error({ error }, 'Deferred reconcile failed');
          });
        });
      }
    }
  }
}
/**
 * Excerpt of GridMaker showing the consecutive-failure counter that triggers
 * the fallback to a full reset.
 */
class GridMaker {
  private incrementalFailures = 0;
  private readonly MAX_INCREMENTAL_FAILURES = 3;

  /**
   * After the (elided) incremental update, compares the achieved level count
   * with the expected one and maintains the failure counter.
   */
  async executeReconcile(plan: ReconcileOutput): Promise<void> {
    // ... run the incremental update ...

    const successRate = finalGrids / expectedGrids;

    // Written as a negated "<" so that a NaN ratio counts as healthy.
    const healthy = !(successRate < 0.8);
    if (healthy) {
      this.incrementalFailures = 0; // reset the counter
      return;
    }

    this.incrementalFailures += 1;
    if (this.incrementalFailures >= this.MAX_INCREMENTAL_FAILURES) {
      await this.fallbackToFullReset();
    }
  }
}
/**
 * Switches incremental mode off, performs a full `reset()`, and schedules
 * incremental mode to be switched back on (with the failure counter cleared)
 * one minute after the reset has completed.
 */
async fallbackToFullReset(): Promise<void> {
  const cooldownMs = 60000; // 1 minute

  // Runs once the cooldown has elapsed.
  const reenableIncremental = (): void => {
    this.incrementalModeEnabled = true;
    this.incrementalFailures = 0;
    this.logger.info('Incremental mode re-enabled after cooldown');
  };

  this.logger.warn(
    { failures: this.incrementalFailures, action: 'fallback_to_full_reset' },
    'Incremental mode failed, falling back to full reset'
  );

  // Temporarily disable incremental mode, then rebuild the whole grid.
  this.incrementalModeEnabled = false;
  await this.reset();

  setTimeout(reenableIncremental, cooldownMs);
}
grid:
  # ... existing settings ...
  incremental_mode:
    enabled: true                  # turn incremental mode on
    modify_threshold_bps: 2        # deviation threshold (< 2 bps: leave the order alone)
    batch_size: 5                  # orders per batch
    batch_interval_ms: 100         # pause between batches
    max_modify_failures: 3         # modify-failure threshold
    fallback_full_reset: true      # fall back to full mode after failures
    fallback_cooldown_ms: 60000    # cooldown after a fallback
    support_native_modify: false   # whether the exchange supports native modify
/**
 * Validation schema for the `incremental_mode` configuration section.
 * Every key is optional in the input and falls back to the default shown.
 */
const IncrementalModeSchema = z.object({
  // Master switch.
  enabled: z.boolean().default(true),

  // Deviation threshold in bps, allowed range 0.5 - 10.
  modify_threshold_bps: z.number().min(0.5).max(10).default(2),

  // Batching of order operations: 1 - 20 orders per batch, 50 - 1000 ms apart.
  batch_size: z.number().int().min(1).max(20).default(5),
  batch_interval_ms: z.number().min(50).max(1000).default(100),

  // Failure handling: threshold 1 - 10, cooldown 10 s - 5 min.
  max_modify_failures: z.number().int().min(1).max(10).default(3),
  fallback_full_reset: z.boolean().default(true),
  fallback_cooldown_ms: z.number().min(10000).max(300000).default(60000),

  // Exchange capability flag.
  support_native_modify: z.boolean().default(false),
});
/** Metrics describing incremental grid maintenance. */
export const gridIncrementalMetrics = {
  // --- reconcile runs ---
  reconcile_total: new Counter({
    name: 'grid_reconcile_total',
    help: 'Total number of reconcile operations',
  }),
  reconcile_duration_ms: new Histogram({
    name: 'grid_reconcile_duration_ms',
    help: 'Duration of reconcile operations',
    buckets: [100, 500, 1000, 2000, 5000],
  }),

  // --- what happened to the orders ---
  orders_kept: new Gauge({
    name: 'grid_orders_kept',
    help: 'Number of orders kept unchanged',
  }),
  orders_modified: new Counter({
    name: 'grid_orders_modified_total',
    help: 'Total number of modified orders',
  }),
  orders_cancelled: new Counter({
    name: 'grid_orders_cancelled_total',
    help: 'Total number of cancelled orders',
  }),
  orders_added: new Counter({
    name: 'grid_orders_added_total',
    help: 'Total number of newly placed orders',
  }),

  // --- failures and degradation ---
  modify_failures: new Counter({
    name: 'grid_modify_failures_total',
    help: 'Total number of modify operation failures',
  }),
  fallback_events: new Counter({
    name: 'grid_fallback_events_total',
    help: 'Total number of fallback to full reset',
  }),

  // --- efficiency ---
  incremental_success_rate: new Gauge({
    name: 'grid_incremental_success_rate',
    help: 'Success rate of incremental updates (0-1)',
  }),
  pending_levels: new Gauge({
    name: 'grid_pending_levels',
    help: 'Number of grid levels in pending state',
  }),
};
# prometheus/alerts/grid.yml
groups:
  - name: grid_incremental
    interval: 30s
    rules:
      - alert: GridIncrementalSuccessRateLow
        expr: grid_incremental_success_rate < 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Grid incremental update success rate below 80%"
          description: "Success rate: {{ $value | humanizePercentage }}"

      - alert: GridModifyFailuresHigh
        expr: rate(grid_modify_failures_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High rate of modify failures"
          description: "{{ $value }} failures per second"

      - alert: GridPendingLevelsStuck
        expr: grid_pending_levels > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Grid has stuck pending levels"
          description: "{{ $value }} levels stuck in pending state"

      # increase() yields the number of fallbacks over the window. rate() would
      # yield a per-second average, which can never exceed 2 here and would not
      # match the "in the last hour" wording of the description.
      - alert: GridFallbackFrequent
        expr: increase(grid_fallback_events_total[1h]) > 2
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Frequent fallback to full reset"
          description: "{{ $value }} fallbacks in the last hour"
describe('GridMaker.reconcileGrid', () => {
  // Creates and initializes a fresh maker for a test case.
  const bootMaker = async () => {
    const instance = new GridMaker(/* ... */);
    await instance.initialize();
    return instance;
  };

  it('should keep unchanged orders when target matches', async () => {
    const maker = await bootMaker();

    // Targets are exactly the prices already on the book.
    const { keepAlive, toCancel, toModify, toAdd } =
      await maker.reconcileGrid(getCurrentPrices());

    expect(keepAlive.length).toBe(20);
    expect(toCancel.length).toBe(0);
    expect(toModify.length).toBe(0);
    expect(toAdd.length).toBe(0);
  });

  it('should modify orders when deviation is moderate', async () => {
    const maker = await bootMaker();

    // Shift every price slightly.
    const shifted = getCurrentPrices().map(price => price * 1.001);
    const { toModify, toCancel } = await maker.reconcileGrid(shifted);

    expect(toModify.length).toBeGreaterThan(0);
    expect(toCancel.length).toBeLessThan(toModify.length);
  });

  it('should cancel and re-place when deviation is large', async () => {
    const maker = await bootMaker();

    // Shift every price by a large amount.
    const shifted = getCurrentPrices().map(price => price * 1.05);
    const { toCancel, toAdd } = await maker.reconcileGrid(shifted);

    expect(toCancel.length).toBe(20);
    expect(toAdd.length).toBe(20);
  });
});
describe('GridMaker incremental mode E2E', () => {
  it('should maintain most orders during volatility adjustment', async () => {
    const maker = new GridMaker(config, router, /* ... */);
    await maker.initialize();
    const ordersBefore = maker.grids.size;

    // Simulate a volatility change so that the step gets adjusted.
    volatilityEstimator.inject(20); // hourlyVolBps: 20
    await maker.onTick();

    // Most orders must still be online.
    const ordersAfter = maker.grids.size;
    expect(ordersAfter).toBeGreaterThan(ordersBefore * 0.8);
  });

  it('should fallback to full reset after repeated failures', async () => {
    const maker = new GridMaker(/* ... */);

    // Make every modify call fail.
    router.modifyLimit = jest.fn().mockRejectedValue(new Error('Rate limit'));

    // Trigger three failures in a row.
    let attempt = 0;
    while (attempt < 3) {
      await maker.maybeAdjustGridStep();
      attempt += 1;
    }

    // The maker must have degraded to full mode.
    expect(maker.incrementalModeEnabled).toBe(false);
    expect(maker.grids.size).toBe(0); // everything cancelled
  });
});
describe('GridMaker under rate limit stress', () => {
  it('should handle rate limits gracefully with batching', async () => {
    const maker = new GridMaker(config, router, /* ... */);

    // Inject latency to simulate rate limiting. Order ids come from a counter:
    // ids derived from Date.now() collide when several orders resolve within
    // the same millisecond.
    let orderSeq = 0;
    router.sendLimitChild = jest.fn().mockImplementation(async () => {
      await sleep(200); // 200 ms per order
      orderSeq += 1;
      return 'order-' + orderSeq;
    });

    const start = Date.now();
    await maker.initialize(); // 20 orders
    const elapsed = Date.now() - start;

    // Batch size 5, interval 100 ms.
    // Expected: 4 batches x (5 orders x 200 ms + 100 ms interval) ~ 4400 ms.
    // NOTE(review): this window assumes the orders of a batch are sent one
    // after another. The batchPlace() shown in this document sends a batch in
    // parallel, which would give roughly 4 x 200 ms + 3 x 100 ms ~ 1100 ms —
    // confirm which path initialize() takes.
    expect(elapsed).toBeLessThan(5000);
    expect(elapsed).toBeGreaterThan(4000);
  });
});
# 1. Update the configuration
vim config/config.yaml
# 2. Canary test (single instance)
# NOTE(review): step 1 edits config/config.yaml while this passes config.yaml — confirm the path.
npm run runner -- --mode=dry-run --config=config.yaml
# 3. Watch the metrics (2-5 minutes). Match every grid_* series: only one of the
#    metrics to check contains "grid_incremental" in its name.
curl -s http://localhost:9090/metrics | grep -E '^grid_'
# 4. Go live once nothing looks wrong
npm run runner -- --mode=live
✓ grid_incremental_success_rate > 0.8
✓ grid_pending_levels < 3
✓ grid_modify_failures_total 增长缓慢(< 0.1/s)
✓ grid_fallback_events_total 无增长
✓ grid_reconcile_duration_ms p95 < 2000ms
问题: 增量更新成功率低或修改失败率高 (GridIncrementalSuccessRateLow / GridModifyFailuresHigh)
可能原因:
1. modify_threshold_bps 设置过小,导致大量误判
2. 交易所 API 不稳定,频繁超时
3. 批次间隔过短,触发限流
排查步骤:
1. 检查日志中的 modify/cancel/place 失败原因
2. 调大 modify_threshold_bps 至 3-5 bps
3. 调大 batch_interval_ms 至 200-300ms
4. 如持续失败,临时禁用增量模式
问题: 档位长时间停留在 pending 状态 (GridPendingLevelsStuck)
可能原因:
1. 并发锁未正确释放
2. 订单状态更新延迟
3. 回调未触发(orderId 不匹配)
排查步骤:
1. 重启 runner,清空状态
2. 检查 WS 订单更新回调是否正常
3. 启用 debug 日志,追踪状态流转
问题: 频繁回退到全量重置 (GridFallbackFrequent)
可能原因:
1. 网络质量差,订单操作失败率高
2. 配置错误(如 batch_size 过大)
3. 交易所限流策略调整
排查步骤:
1. 检查网络延迟 (ping 交易所)
2. 降低 batch_size 至 3
3. 增加 batch_interval_ms 至 500ms
4. 联系交易所确认限流策略
# Option 1: hot-reload the configuration
# Edit config.yaml so that it contains:
#   grid:
#     incremental_mode:
#       enabled: false
# Then trigger the hot reload
kill -HUP <runner_pid>
# Option 2: restart the service
npm run runner:stop
# (edit the configuration)
npm run runner:start
# Option 3: use the backup configuration
npm run runner -- --config=config/rollback/safe_mode.yaml
- reconcileGrid() 核心逻辑
- modifyOrReplace() 降级机制

packages/strategies/src/gridMaker.ts
├── reconcileGrid() # 核心差分算法
├── modifyOrReplace() # 修改/替换逻辑
├── batchCancel() # 批次撤销
├── batchPlace() # 批次新增
└── fallbackToFullReset() # 降级处理
packages/execution/src/orderRouter.ts
└── modifyLimit() # 订单修改接口(待实现)
packages/telemetry/src/gridMetrics.ts
└── gridIncrementalMetrics # 增量模式指标
- IMPLEMENTATION_PLAN.md - M1.6 总体规划
- GRID_STRATEGY_DESIGN.md - 网格策略原理
- CONFIG_REFERENCE.md - 配置参数详解
- OPERATIONS_PLAYBOOK.md - 运维手册