# test_averaging.py
import asyncio
import random
import time

import pytest
import torch

import hivemind
from hivemind.client.averaging.allreduce import AllReduceProtocol, split_into_parts, restore_from_parts
from hivemind.utils import Endpoint
  9. @pytest.mark.forked
  10. def test_getset_averagers():
  11. dht = hivemind.DHT(start=True)
  12. t = hivemind.get_dht_time()
  13. dht.declare_averager(group_key='bucket.0b10110', endpoint='localhvost', expiration_time=t + 60)
  14. dht.declare_averager(group_key='bucket.0b10110', endpoint='localhvost2', expiration_time=t + 61)
  15. q1 = dht.get_averagers('bucket.0b10110', only_active=True)
  16. dht.declare_averager(group_key='bucket.0b10110', endpoint='localhvost', expiration_time=t + 66)
  17. q2 = dht.get_averagers('bucket.0b10110', only_active=True)
  18. dht.declare_averager(group_key='bucket.0b10110', endpoint='localhvost2', looking_for_group=False,
  19. expiration_time=t + 61)
  20. q3 = dht.get_averagers('bucket.0b10110', only_active=True)
  21. q4 = dht.get_averagers('bucket.0b10110', only_active=False)
  22. assert len(q1) == 2 and ('localhvost', t + 60) in q1 and ('localhvost2', t + 61) in q1
  23. assert len(q2) == 2 and ('localhvost', t + 66) in q2 and ('localhvost2', t + 61) in q2
  24. assert len(q3) == 1 and ('localhvost', t + 66) in q3
  25. assert len(q4) == 2 and ('localhvost', t + 66) in q4 and ('localhvost2', t + 61) in q2
  26. @pytest.mark.forked
  27. @pytest.mark.asyncio
  28. async def test_allreduce_once():
  29. dht = hivemind.DHT(start=True)
  30. tensors1 = [torch.randn(123), torch.zeros(3)]
  31. tensors2 = [torch.rand(123), torch.ones(3)]
  32. tensors3 = [-torch.rand(123), torch.arange(3).to(torch.float32)]
  33. tensors4 = [torch.randn(123) ** 3, torch.arange(3).to(torch.float32) / 2]
  34. reference = [(tensors1[i] + tensors2[i] + tensors3[i] + tensors4[i]) / 4 for i in range(len(tensors1))]
  35. averagers = [hivemind.DecentralizedAverager(tensors, dht=dht, target_group_size=4, averaging_expiration=15,
  36. prefix='mygroup', initial_group_bits='0110', listen_on='127.0.0.1:*',
  37. start=True)
  38. for tensors in [tensors1, tensors2, tensors3, tensors4]]
  39. futures = []
  40. for averager in averagers:
  41. futures.append(averager.step(return_future=True)) # TODO revert to hard version
  42. time.sleep(0.5)
  43. for future in futures:
  44. for ref, our in zip(reference, future.result()):
  45. assert torch.allclose(ref, our)
  46. @pytest.mark.forked
  47. @pytest.mark.asyncio
  48. async def test_allreduce_protocol():
  49. """ Run group allreduce protocol manually without grpc, see if the internal logic is working as intended """
  50. peers = "alice", "bob", "carol"
  51. tensors_by_peer = {peer: [torch.randn(3, 128), torch.rand(32), torch.tensor(i, dtype=torch.float32)]
  52. for i, peer in enumerate(peers)}
  53. group_id = random.getrandbits(160).to_bytes(length=20, byteorder='big')
  54. allreduce_protocols = [AllReduceProtocol(
  55. group_id=group_id, endpoint=peer, tensors=tensors_by_peer[peer], ordered_group_endpoints=peers)
  56. for peer in peers]
  57. async def _accumulate(sender: Endpoint, recipient: Endpoint):
  58. sender_allreduce = allreduce_protocols[peers.index(sender)]
  59. recipient_allreduce = allreduce_protocols[peers.index(recipient)]
  60. averaged_part = await recipient_allreduce.accumulate_part(
  61. source=sender, remote_part=sender_allreduce.local_tensor_parts[recipient])
  62. sender_allreduce.register_averaged_part(source=recipient, averaged_part=averaged_part)
  63. await asyncio.wait({_accumulate(sender, recipient) for sender in peers for recipient in peers
  64. if sender != recipient})
  65. reference_tensors = [
  66. sum(tensors_by_peer[peer][i] for peer in peers) / len(peers)
  67. for i in range(len(tensors_by_peer[peers[0]]))
  68. ]
  69. for peer, allreduce in zip(peers, allreduce_protocols):
  70. assert allreduce.averaged_tensors.done()
  71. averaged_tensors = await allreduce
  72. assert len(averaged_tensors) == len(reference_tensors)
  73. assert all(torch.allclose(our, ref, atol=1e-6, rtol=0)
  74. for our, ref in zip(averaged_tensors, reference_tensors))
  75. @pytest.mark.forked
  76. def test_chunks():
  77. for _ in range(100):
  78. tensors = []
  79. for _ in range(random.randint(1, 5)):
  80. ndim = random.randint(0, 4)
  81. shape = torch.Size([random.randint(0, 16) for _ in range(ndim)])
  82. make_tensor = random.choice([torch.rand, torch.randn, torch.zeros, torch.ones])
  83. tensors.append(make_tensor(shape))
  84. total_size = sum(map(torch.Tensor.numel, tensors))
  85. if total_size == 0:
  86. continue
  87. num_chunks = random.randint(1, min(1000, sum(x.numel() for x in tensors)))
  88. chunks = split_into_parts(tensors, group_size=num_chunks)
  89. assert len(chunks) == num_chunks
  90. shapes = [tensor.shape for tensor in tensors]
  91. restored = restore_from_parts(chunks, shapes)
  92. assert len(restored) == len(tensors)
  93. assert all(new.shape == old.shape for new, old in zip(restored, tensors))
  94. assert all(torch.allclose(new, old) for new, old in zip(restored, tensors))