averaging.proto 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. syntax = "proto3";
  2. import "runtime.proto";
  // Runs alongside each trainer to perform gating function averaging every
  // now and then. Read more: client/averaging.py
  service DecentralizedAveraging {
    // Assemble a group and run all-reduce. Takes the caller's PeerInfo and
    // answers with a stream of MessageFromLeader messages.
    rpc rpc_group_allreduce(PeerInfo) returns (stream MessageFromLeader);

    // Send my local shard => get the aggregated shard back.
    rpc rpc_aggregate_part(AveragingData) returns (AveragingData);
  }
  // Request message of rpc_group_allreduce: describes the follower that asks
  // to be taken into a group.
  message PeerInfo {
    // Address at which the follower accepts incoming allreduce requests.
    string endpoint = 1;

    // Hash that describes the follower's tensors (shapes, number of tensors,
    // etc).
    bytes schema_hash = 2;

    // Point in time by which the follower would like to **begin** all_reduce.
    // NOTE(review): the unit and epoch of this value are not stated in this
    // file - confirm against the code that fills it in.
    double expiration = 3;
  }
  // Status codes carried in the `code` field of MessageFromLeader and
  // AveragingData. Each value is annotated with what the sender is saying.
  enum MessageCode {
    // --- response to join request ---

    // "I accept you in my group, you will not commit to responding to me."
    // NOTE(review): "not" is possibly a typo for "now" - confirm with the
    // authors.
    // NOTE(review): this is the zero value, so in proto3 a message whose code
    // was never set also reads as ACCEPTED. Renumbering would break the wire
    // format, so the numbering is kept as is.
    ACCEPTED = 0;

    // "I am not a group leader. Go ask my leader instead."
    NOT_A_LEADER = 1;

    // "My group has already begun merging. Here's the group leader."
    ALREADY_RUNNING = 2;

    // "I'm not available at the moment. Please, get lost."
    NOT_LOOKING_FOR_GROUP = 3;

    // "I will not accept you. I cannot guarantee that we begin before you
    // expire."
    BAD_EXPIRATION_TIME = 4;

    // "I will not accept you. I am not averaging the same type of tensors as
    // you."
    BAD_SCHEMA_HASH = 5;

    // "I will not accept you, I already have exactly the same endpoint in my
    // current group."
    DUPLICATE_ENDPOINT = 6;

    // "I will not accept you, my group already contains too many peers."
    GROUP_IS_FULL = 7;

    // "We can begin allreduce now. These are your peers."
    BEGIN_ALLREDUCE = 8;

    // "The group is closed. Go find another group."
    GROUP_DISBANDED = 9;

    // "Your request uses a group id that doesn't match any group I know."
    UNKNOWN_GROUP_ID = 10;

    // "One of the peers did something in violation of the allreduce
    // protocol."
    PROTOCOL_VIOLATION = 11;

    // "We encountered an unexpected error on our side."
    INTERNAL_ERROR = 12;

    // "A peer cancelled allreduce while averaging."
    CANCELLED = 13;
  }
  // Message streamed back to the caller of rpc_group_allreduce.
  message MessageFromLeader {
    // One of the MessageCode values.
    MessageCode code = 1;

    // Unique identifier of this group; only valid until allreduce is finished
    // or failed.
    bytes group_id = 2;

    // If the peer is already in a group, it provides the endpoint of its
    // leader here.
    string suggested_leader = 3;

    // Sequence of peers, each responsible for one shard during averaging.
    repeated string ordered_group_endpoints = 4;
  }
  // Both the request and the response of rpc_aggregate_part.
  message AveragingData {
    // In case of a protocol violation, this will be the error message
    // (expressed as a MessageCode value).
    MessageCode code = 1;

    // Unique group identifier, same as in MessageFromLeader.
    bytes group_id = 2;

    // Sender's rpc endpoint, used for coordination.
    string endpoint = 3;

    // Either the peer's local tensor part (rpc input) or the group average of
    // this part (rpc output).
    // NOTE(review): Tensor is not declared in this file; presumably it comes
    // from the imported runtime.proto - confirm.
    Tensor tensor_part = 4;
  }