@@ -29,14 +29,13 @@ def test_remote_module_call():
     probs = torch.softmax(logits, 0)
     outs = [expert(xx) for expert in experts[:3]]
     manual_output = sum(p * x for p, x in zip(probs, outs))
-    print(f'ref {[manual_output.min(), manual_output.max(), manual_output.norm()]}')
     grad_xx_manual, = torch.autograd.grad(torch.sum(random_proj * manual_output), xx, retain_graph=True)
     grad_xx_manual_rerun, = torch.autograd.grad(torch.sum(random_proj * manual_output), xx, retain_graph=True)
     grad_logits_manual, = torch.autograd.grad(torch.sum(random_proj * manual_output), logits, retain_graph=True)
 
-    assert torch.allclose(moe_output, manual_output), "_RemoteMoECall returned incorrect output"
     assert torch.allclose(grad_xx_manual, grad_xx_manual_rerun), "Experts are non-deterministic. This test is only " \
                                                                  "valid for deterministic experts"
+    assert torch.allclose(moe_output, manual_output, rtol=1e-3, atol=1e-6), "_RemoteMoECall returned incorrect output"
     assert torch.allclose(grad_xx_moe, grad_xx_manual, rtol=1e-3, atol=1e-6), "incorrect gradient w.r.t. input"
     assert torch.allclose(grad_logits_moe, grad_logits_manual, rtol=1e-3, atol=1e-6), "incorrect gradient w.r.t. logits"
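For reference, the softmax-weighted mixture and the tolerance-based comparisons used in the assertions above can be exercised on their own. The sketch below is a minimal stand-in, not the test's fixtures: it assumes plain `torch.nn.Linear` modules in place of the remote experts, and the names `xx`, `logits`, and `random_proj` are illustrative; only the `rtol=1e-3, atol=1e-6` values mirror the diff.

```python
import torch

# Minimal sketch, assuming local nn.Linear stand-ins for the remote experts.
torch.manual_seed(0)
xx = torch.randn(4, 16, requires_grad=True)
logits = torch.randn(3, requires_grad=True)
experts = [torch.nn.Linear(16, 16) for _ in range(3)]

# Reference mixture: softmax over the gating logits, weighted sum of expert outputs.
probs = torch.softmax(logits, 0)
outs = [expert(xx) for expert in experts]
manual_output = sum(p * x for p, x in zip(probs, outs))

# Running the same mixture twice should agree within the tolerances the
# updated assertions use; this mirrors the determinism check in the test.
rerun = sum(p * x for p, x in zip(probs, [expert(xx) for expert in experts]))
assert torch.allclose(manual_output, rerun, rtol=1e-3, atol=1e-6)

# Gradient check against a random projection, as in the test's grad_* comparisons.
random_proj = torch.randn_like(xx)
grad_xx, = torch.autograd.grad(torch.sum(random_proj * manual_output), xx, retain_graph=True)
grad_xx_rerun, = torch.autograd.grad(torch.sum(random_proj * manual_output), xx, retain_graph=True)
assert torch.allclose(grad_xx, grad_xx_rerun, rtol=1e-3, atol=1e-6)
```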