Aleksandr Borzunov 3 years ago
parent commit 8cf3b43e5a
2 changed files with 10 additions and 8 deletions

+ 5 - 3
arguments.py

@@ -27,7 +27,6 @@ class HFTrainerArguments(TrainingArguments):
     clamp_value: float = 10000.0
 
     fp16: bool = False
-    fp16_opt_level: str = "O2"
     do_train: bool = True
 
     logging_steps: int = 100
@@ -65,10 +64,13 @@ class CollaborativeArguments:
         metadata={"help": "Perform optimizer step after all peers collectively accumulate this many samples"},
     )
     matchmaking_time: float = field(
-        default=15.0, metadata={"help": "Averaging group will wait for stragglers for at most this many seconds"}
+        default=30.0, metadata={"help": "Averaging group will wait for stragglers for at most this many seconds"}
+    )
+    allreduce_timeout: float = field(
+        default=60, metadata={"help": "Give up on a given all-reduce round after this many seconds"}
     )
     averaging_timeout: float = field(
-        default=120, metadata={"help": "Give up on averaging step after this many seconds"}
+        default=180, metadata={"help": "Give up on averaging step after this many seconds"}
     )
     reuse_grad_buffers: bool = field(default=True, metadata={
         "help": "Whether or not to use model's .grad buffers for accumulating gradients across local steps. This "

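Note on the new values: the three timeouts are nested. Matchmaking waits for stragglers for up to 30 s, each all-reduce round is abandoned after 60 s, and the averaging step as a whole gives up after 180 s, so each limit stays comfortably inside the next. Below is a minimal sketch of how such dataclass fields are typically consumed; the use of transformers.HfArgumentParser in run_trainer.py is an assumption, not something shown in this diff:

from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class CollaborativeArguments:
    matchmaking_time: float = field(
        default=30.0, metadata={"help": "Averaging group will wait for stragglers for at most this many seconds"}
    )
    allreduce_timeout: float = field(
        default=60, metadata={"help": "Give up on a given all-reduce round after this many seconds"}
    )
    averaging_timeout: float = field(
        default=180, metadata={"help": "Give up on averaging step after this many seconds"}
    )


# Each field becomes a CLI flag whose description is the metadata "help" string,
# e.g. python run_trainer.py --matchmaking_time 30 --allreduce_timeout 60
(collab_args,) = HfArgumentParser(CollaborativeArguments).parse_args_into_dataclasses()
print(collab_args)
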
+ 5 - 5
manage_scaleset.py

@@ -15,7 +15,7 @@ LOCATION = "northeurope"
 ADMIN_PASS = os.environ['AZURE_PASS']
 
 SCALE_SETS = ('worker',)
-SWARM_SIZE = 16
+SWARM_SIZE = 64
 
 WORKER_CLOUD_INIT = """#cloud-config
 package_update: true
@@ -37,7 +37,7 @@ write_files:
       conda install python~=3.8.0 pip
       conda install pytorch cudatoolkit=11.1 -c pytorch -c nvidia
       conda clean --all
-      pip install https://github.com/learning-at-home/hivemind/archive/c328698b8668e6c548a571c175d045ac7df08586.zip
+      pip install https://github.com/learning-at-home/hivemind/archive/scaling_tweaks.zip
       systemctl enable testserv
       systemctl start testserv
   - path: /etc/systemd/system/testserv.service
@@ -63,15 +63,15 @@ write_files:
       cd /home/hivemind
       ulimit -n 8192
       
-      git clone https://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@github.com/learning-at-home/dalle-hivemind.git -b main
+      git clone https://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@github.com/learning-at-home/dalle-hivemind.git -b azure
       cd dalle-hivemind
       pip install -r requirements.txt
       pip install -U transformers==4.10.2 datasets==1.11.0
       
       WANDB_API_KEY=7cc938e45e63ef7d2f88f811be240ba0395c02dd python run_trainer.py --run_name $(hostname) \
          --experiment_prefix dalle_large_5groups \
-         --initial_peers /ip4/172.16.1.66/tcp/31334/p2p/QmZLrSPKAcP4puJ8gUGvQ155thk5Q6J7oE5exMUSq1oD5i \
-         --per_device_train_batch_size 4 --gradient_accumulation_steps 2 --fp16
+         --initial_peers /ip4/52.232.13.142/tcp/31334/p2p/QmZLrSPKAcP4puJ8gUGvQ155thk5Q6J7oE5exMUSq1oD5i \
+         --per_device_train_batch_size 1 --gradient_accumulation_steps 1
 runcmd:
   - bash /home/hivemind/init_worker.sh
 """
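
Note on SWARM_SIZE: raising it from 16 to 64 quadruples the number of worker VMs per scale set, and each new VM bootstraps itself with the cloud-init script above. A hypothetical sketch of how the script could apply the new capacity with the Azure CLI (the resource group name and the az vmss scale call are assumptions; the real manage_scaleset.py may use the Azure SDK instead):

import subprocess

SCALE_SETS = ("worker",)
SWARM_SIZE = 64
RESOURCE_GROUP = "hivemind-workers"  # hypothetical name, not shown in this diff

# Resize every scale set to SWARM_SIZE instances with one CLI call each.
for scale_set in SCALE_SETS:
    subprocess.run(
        [
            "az", "vmss", "scale",
            "--resource-group", RESOURCE_GROUP,
            "--name", scale_set,
            "--new-capacity", str(SWARM_SIZE),
        ],
        check=True,
    )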