test&style

Artem Chumachenko, 3 years ago
parent commit 43e029e248
2 files changed, 5 additions and 20 deletions
  1. +3 -7  hivemind/optim/power_sgd_averager.py
  2. +2 -13 tests/test_optimizer.py

+ 3 - 7
hivemind/optim/power_sgd_averager.py

@@ -126,9 +126,7 @@ class PowerSGDGradientAverager(GradientAverager):
                 for grad, m in zip(averaged_grads, self._ms):
                     m.add_(grad.to(m.device))
 
-                averaged_sgd_ms = [
-                    m for idx, m in enumerate(self._ms) if idx not in self._uncompressed_gradients
-                ]
+                averaged_sgd_ms = [m for idx, m in enumerate(self._ms) if idx not in self._uncompressed_gradients]
                 averaged_sgd_grad = [
                     grad for idx, grad in enumerate(averaged_grads) if idx not in self._uncompressed_gradients
                 ]
@@ -139,9 +137,7 @@ class PowerSGDGradientAverager(GradientAverager):
                 ]
                 for p, q, m in zip(ps, self._qs, averaged_sgd_ms):
                     torch.matmul(m.reshape(-1, q.size(0)), q, out=p)
-                first_all_reduced = ps + [
-                    m for idx, m in enumerate(self._ms) if idx in self._uncompressed_gradients
-                ]
+                first_all_reduced = ps + [m for idx, m in enumerate(self._ms) if idx in self._uncompressed_gradients]
                 allreduce1 = AllReduceRunner(
                     p2p=self._p2p,
                     servicer_type=type(self),
@@ -247,7 +243,7 @@ def orthogonalize(matrix, eps=torch.tensor(1e-8)):
     n, m = matrix.shape
     for i in range(m):
         col = matrix[:, i : i + 1]
-        col /= torch.sqrt(torch.sum(col ** 2)) + eps
+        col /= torch.sqrt(torch.sum(col**2)) + eps
         if i + 1 < m:
             rest = matrix[:, i + 1 :]
             rest -= torch.sum(col * rest, dim=0) * col
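
The hunks above are formatting-only (collapsing wrapped list comprehensions and tightening col ** 2 to col**2, per Black), but they sit in the middle of the PowerSGD round: compute P = M @ Q, then orthogonalize P's columns. For context, a minimal sketch of that rank-r compression step, reusing orthogonalize exactly as it appears in this file; the tensor shapes and the Q refresh are illustrative assumptions following the PowerSGD scheme, not lines visible in this diff.

    import torch

    def orthogonalize(matrix, eps=torch.tensor(1e-8)):
        # Modified Gram-Schmidt over columns, as in power_sgd_averager.py:
        # normalize each column (eps guards near-zero columns), then remove
        # its component from every column to its right.
        n, m = matrix.shape
        for i in range(m):
            col = matrix[:, i : i + 1]
            col /= torch.sqrt(torch.sum(col**2)) + eps
            if i + 1 < m:
                rest = matrix[:, i + 1 :]
                rest -= torch.sum(col * rest, dim=0) * col

    # Illustrative rank-4 step (shapes are assumptions):
    M = torch.randn(512, 64)  # gradient accumulator, reshaped to 2-D
    Q = torch.randn(64, 4)    # right factor, reused across steps
    P = M @ Q                 # the matmul from the second hunk above
    orthogonalize(P)          # orthonormalize P's columns in place
    Q = M.t() @ P             # refresh Q (PowerSGD scheme; not in this diff)
    M_hat = P @ Q.t()         # low-rank approximation that gets all-reduced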

+ 2 - 13
tests/test_optimizer.py

@@ -149,21 +149,16 @@ def test_state_averager(offload_optimizer: bool, reuse_tensors: bool, sync_epoch
 
 
 @pytest.mark.forked
-def test_load_state_from_peers(dpu: bool = False):
+def test_load_state_from_peers():
     dht1 = hivemind.DHT(start=True)
     dht2 = hivemind.DHT(initial_peers=dht1.get_visible_maddrs(), start=True)
 
     model1 = nn.Linear(2, 3)
     model2 = nn.Linear(2, 3)
 
-    extras1 = (torch.randn(2, 2), -torch.rand(1))
-    extras2 = (-torch.randn(2, 2), torch.rand(1))
-
     common_kwargs = dict(
         optimizer=partial(torch.optim.SGD, lr=0.1),
         scheduler=partial(torch.optim.lr_scheduler.LambdaLR, lr_lambda=lambda t: 1.0 / max(1, t)),
-        offload_optimizer=dpu,
-        reuse_tensors=dpu,
         target_group_size=2,
         prefix="my_exp",
     )
@@ -173,25 +168,19 @@ def test_load_state_from_peers(dpu: bool = False):
         params=model1.parameters(),
         allow_state_sharing=False,
         start=True,
-        extra_tensors=extras1,
         **common_kwargs,
     )
 
-    avgr2 = TrainingStateAverager(
-        dht=dht2, params=model2.parameters(), start=True, extra_tensors=extras2, **common_kwargs
-    )
+    avgr2 = TrainingStateAverager(dht=dht2, params=model2.parameters(), start=True, **common_kwargs)
 
     avgr2.local_epoch = 1337
     model2.weight.data[...] = 42
-    extras2[0][:] = 9999
     time.sleep(0.1)
 
     avgr1.load_state_from_peers()
     assert avgr1.local_epoch == 1337
     assert torch.all(model1.weight == 42).item()
     assert np.allclose(avgr1.optimizer.param_groups[0]["lr"], 0.1 / 1337)
-    assert torch.all(extras1[0] == extras2[0]).item() and torch.all(extras1[0] == extras2[0]).item()
-    assert torch.all(extras1[0] == 9999).item()
 
 
 @pytest.mark.forked
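
With the dpu flag and extra_tensors fixtures removed, the test reduces to the bare state-sync flow: one peer announces its state on the DHT, the other pulls epoch, parameters, and optimizer state via load_state_from_peers. Below is a condensed standalone sketch of that flow, assuming TrainingStateAverager is imported from hivemind.optim.state_averager as in the test suite.

    import time
    from functools import partial

    import hivemind
    import torch
    from torch import nn
    from hivemind.optim.state_averager import TrainingStateAverager

    # Two peers on one DHT; avgr2 holds the state that avgr1 will fetch.
    dht1 = hivemind.DHT(start=True)
    dht2 = hivemind.DHT(initial_peers=dht1.get_visible_maddrs(), start=True)

    common_kwargs = dict(
        optimizer=partial(torch.optim.SGD, lr=0.1),
        target_group_size=2,
        prefix="my_exp",
    )
    model1, model2 = nn.Linear(2, 3), nn.Linear(2, 3)
    avgr1 = TrainingStateAverager(
        dht=dht1, params=model1.parameters(), allow_state_sharing=False,
        start=True, **common_kwargs,
    )
    avgr2 = TrainingStateAverager(
        dht=dht2, params=model2.parameters(), start=True, **common_kwargs
    )

    avgr2.local_epoch = 1337       # pretend peer 2 is far ahead
    model2.weight.data[...] = 42
    time.sleep(0.1)                # give the DHT time to publish the state

    avgr1.load_state_from_peers()  # peer 1 adopts peer 2's epoch and weights
    assert avgr1.local_epoch == 1337
    assert torch.all(model1.weight == 42).item()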