Jelajahi Sumber

Tweak settings for the upcoming demo (#2)

Co-authored-by: justheuristic <justheuristic@gmail.com>
Alexander Borzunov 3 tahun lalu
induk
melakukan
c365b2ec9f
4 mengubah file dengan 244 tambahan dan 9 penghapusan
  1. 6 4
      arguments.py
  2. 236 0
      manage_scaleset.py
  3. 1 3
      run_trainer.py
  4. 1 2
      task.py

+ 6 - 4
arguments.py

@@ -15,7 +15,7 @@ class HFTrainerArguments(TrainingArguments):
     text_seq_length: int = 256
 
     # DALLE-specific params
-    learning_rate: float = 0.003535
+    learning_rate: float = 0.0025
     adam_beta1: float = 0.9
     adam_beta2: float = 0.96
     max_grad_norm: float = 4.0
@@ -27,7 +27,6 @@ class HFTrainerArguments(TrainingArguments):
     clamp_value: float = 10000.0
 
     fp16: bool = False
-    fp16_opt_level: str = "O2"
     do_train: bool = True
 
     logging_steps: int = 100
@@ -61,14 +60,17 @@ class TPUTrainerArguments(HFTrainerArguments):
 class CollaborativeArguments:
     """Configuration for CollaborativeOptimzier and its internals"""
     target_batch_size: int = field(
-        default=16384,
+        default=4096,
         metadata={"help": "Perform optimizer step after all peers collectively accumulate this many samples"},
     )
     matchmaking_time: float = field(
         default=15.0, metadata={"help": "Averaging group will wait for stragglers for at most this many seconds"}
     )
+    allreduce_timeout: float = field(
+        default=60, metadata={"help": "Give up on a given all-reduce round after this many seconds"}
+    )
     averaging_timeout: float = field(
-        default=120, metadata={"help": "Give up on averaging step after this many seconds"}
+        default=180, metadata={"help": "Give up on averaging step after this many seconds"}
     )
     reuse_grad_buffers: bool = field(default=True, metadata={
         "help": "Whether or not to use model's .grad buffers for accumulating gradients across local steps. This "

+ 236 - 0
manage_scaleset.py

@@ -0,0 +1,236 @@
+import os
+from argparse import ArgumentParser
+from base64 import b64encode
+
+from azure.identity import DefaultAzureCredential
+from azure.mgmt.compute import ComputeManagementClient
+from azure.mgmt.network import NetworkManagementClient
+from azure.mgmt.resource import ResourceManagementClient
+
+
# NOTE(review): banner warns about a harmless 'gi' import error apparently
# seen on the base VM image — the underlying reason is not visible here.
print("=======================WARNING=======================")
print("= The code may fail to import 'gi' but that is okay =")
print("===================END OF WARNING====================")
# Azure subscription to operate in; required environment variable.
SUBSCRIPTION_ID = os.environ["SUBSCRIPTION_ID"]
# Resource group / virtual network / subnet names and region for the swarm.
GROUP_NAME = "dalle_west2"
NETWORK_NAME = "vnet"
SUBNET_NAME = "subnet"
LOCATION = "westus2"
# Admin password for the VMs' 'hivemind' user; required environment variable.
ADMIN_PASS = os.environ['AZURE_PASS']

# Names of the VM scale sets to manage and how many VMs each should contain.
SCALE_SETS = ('worker',)
SWARM_SIZE = 4
+
# Secrets are injected from the environment instead of being committed in plain
# text, matching how SUBSCRIPTION_ID / AZURE_PASS are handled above.
# NOTE(review): the previous revision hard-coded a GitHub personal access token
# and a W&B API key directly in this payload; those credentials are leaked in
# version control history and must be revoked regardless of this fix.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")

# cloud-init payload for worker VMs: installs conda + pytorch + hivemind, then
# registers a one-shot systemd unit that clones the training repo and launches
# run_trainer.py on first boot.  Literal shell ${...} expansions are written as
# ${{...}} because this is an f-string.
WORKER_CLOUD_INIT = f"""#cloud-config
package_update: true
packages:
  - build-essential
  - wget
  - git
  - vim
write_files:
  - path: /home/hivemind/init_worker.sh
    permissions: '0766'
    owner: root:root
    content: |
      #!/usr/bin/env bash
      set -e
      wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O install_miniconda.sh
      bash install_miniconda.sh -b -p /opt/conda
      export PATH="/opt/conda/bin:${{PATH}}"
      conda install python~=3.8.0 pip
      conda install pytorch cudatoolkit=11.1 -c pytorch -c nvidia
      conda clean --all
      pip install https://github.com/learning-at-home/hivemind/archive/scaling_tweaks.zip
      systemctl enable testserv
      systemctl start testserv
  - path: /etc/systemd/system/testserv.service
    permissions: '0777'
    owner: root:root
    content: |
      [Unit]
      Description=One Shot

      [Service]
      ExecStart=/etc/createfile
      Type=oneshot
      RemainAfterExit=yes

      [Install]
      WantedBy=multi-user.target
  - path: /etc/createfile
    permissions: '0777'
    owner: root:root
    content: |
      #!/bin/bash
      export PATH="/opt/conda/bin:${{PATH}}"
      cd /home/hivemind
      ulimit -n 8192
      
      git clone https://{GITHUB_TOKEN}@github.com/learning-at-home/dalle-hivemind.git -b azure
      cd dalle-hivemind
      pip install -r requirements.txt
      pip install -U transformers==4.10.2 datasets==1.11.0
      
      WANDB_API_KEY={WANDB_API_KEY} python run_trainer.py --run_name $(hostname) \
         --experiment_prefix dalle_large_5groups \
         --initial_peers /ip4/52.232.13.142/tcp/31334/p2p/QmZLrSPKAcP4puJ8gUGvQ155thk5Q6J7oE5exMUSq1oD5i \
         --per_device_train_batch_size 1 --gradient_accumulation_steps 1
runcmd:
  - bash /home/hivemind/init_worker.sh
"""
+
+
def _make_network(resource_client, network_client):
    """Create (or update) the resource group, virtual network and subnet.

    All calls are idempotent ``create_or_update`` operations, so re-running
    ``create`` against existing infrastructure is safe.
    """
    resource_client.resource_groups.create_or_update(
        GROUP_NAME,
        {"location": LOCATION},
    )
    network_client.virtual_networks.begin_create_or_update(
        GROUP_NAME,
        NETWORK_NAME,
        {
            "location": LOCATION,
            "address_space": {"address_prefixes": ["10.0.0.0/16"]},
        },
    ).result()
    network_client.subnets.begin_create_or_update(
        GROUP_NAME,
        NETWORK_NAME,
        SUBNET_NAME,
        {"address_prefix": "10.0.0.0/16"},
    ).result()


def _scaleset_config(scaleset_name):
    """Build the VMSS creation payload for one worker scale set.

    The VM image is NVIDIA's NGC base image; the worker software itself is
    installed on first boot via the cloud-init payload in WORKER_CLOUD_INIT,
    passed base64-encoded through ``custom_data``.
    """
    vm_image = {
        "exactVersion": "21.06.0",
        "offer": "ngc_base_image_version_b",
        "publisher": "nvidia",
        "sku": "gen2_21-06-0",
        "version": "latest",
    }
    return {
        "sku": {
            "tier": "Standard",
            "capacity": SWARM_SIZE,
            "name": "Standard_NC4as_T4_v3",  # 4 vCPU + one T4 GPU per worker
        },
        # Marketplace images require a matching purchase plan.
        "plan": {
            "name": "gen2_21-06-0",
            "publisher": "nvidia",
            "product": "ngc_base_image_version_b",
        },
        "location": LOCATION,
        "virtual_machine_profile": {
            "storage_profile": {
                "image_reference": vm_image,
                "os_disk": {
                    "caching": "ReadWrite",
                    "managed_disk": {"storage_account_type": "Standard_LRS"},
                    "create_option": "FromImage",
                    "disk_size_gb": "32",
                },
            },
            "os_profile": {
                "computer_name_prefix": scaleset_name,
                "admin_username": "hivemind",
                "admin_password": ADMIN_PASS,
                "linux_configuration": {
                    "disable_password_authentication": True,
                    "ssh": {
                        "public_keys": [
                            {
                                "key_data": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDPFugAsrqEsqxj+hKDTfgrtkY26jqCjRubT5vhnJLhtkDAqe5vJ1donWfUVhtBfnqGr92LPmJewPUd9hRa1i33FLVVdkFAs5/Cg8/YbzR8B8e1Y+Nl5HeT7Dq1i+cPEbA1EZAm9tqK4VWYeCMd3CDkoJVuweTwyja08mxtnVNwKCeY4oBKQCE5QlliAKaQnGpJE6MRnbudWM9Ly1wM6OaJVdGwsfPfEG/sSDip4q/8x/KGAzKbhE6ax15Yu/Bu12ahcIdScQsYK9Y6Sm57MHQQLWQO1G+3h3oCTXQ0BGaSMWKXsjmHsB7f9kLZ1j8yMoGlgbpWbjB0ZVsK/4Zh8Ho3h9gDXADzt1j69qT1aERWCt7fxp9+WOLsCTw1W/W9FY2Ia4niVh2/wEwT9AcOBcAqBl7kXQAoUpP8b2Xb+KNXyTEtVB562EdFn+LmG1gZAy8J3piy2/zoo16QJP5PjpKW5GFxL6BRYLtG+uxgx1Glya617T0dtJF/X2vxjT45QK3FaFH1Zd+vhpcLg94fOPNPEhNU7EeBVp8CGYNd+aXVIPsb0I7EIVu9wWi3/a7y86cUedal61fEigfmAQkC7AHYiAiiT94eARj0N+KgjEy2UOITSCJJTHuamYWO8jZc/n7yAqr6mxOKn5ZjBTfAR9bNB/D+HpL6yepI1UDGBVk4DQ== justHeuristic@gmail.com\n",
                                "path": "/home/hivemind/.ssh/authorized_keys",
                            }
                        ]
                    },
                },
                # cloud-init custom_data must be base64-encoded.
                "custom_data": b64encode(WORKER_CLOUD_INIT.encode('utf-8')).decode('latin-1'),
            },
            "network_profile": {
                "network_interface_configurations": [
                    {
                        "name": "test",
                        "primary": True,
                        "enable_accelerated_networking": True,
                        "ip_configurations": [
                            {
                                "name": "test",
                                "subnet": {
                                    "id": f"/subscriptions/{SUBSCRIPTION_ID}/resourceGroups/{GROUP_NAME}/providers/Microsoft.Network/virtualNetworks/{NETWORK_NAME}/subnets/{SUBNET_NAME}"
                                },
                                # Public IP per VM so peers are reachable from outside the vnet.
                                "public_ip_address_configuration": {
                                    "name": "pub1",
                                    "idle_timeout_in_minutes": 15,
                                },
                            }
                        ],
                    }
                ]
            },
            "diagnostics_profile": {"boot_diagnostics": {"enabled": True}},
            # Spot VMs with deallocate-on-eviction + spot restore: evicted
            # workers come back automatically when capacity reappears.
            "priority": "spot",
            "eviction_policy": "deallocate",
        },
        "upgrade_policy": {"mode": "Manual"},
        # NOTE(review): "upgrade_mode" duplicates upgrade_policy.mode and looks
        # like it is not a recognized top-level field — kept for parity, confirm
        # against the VMSS API schema before removing.
        "upgrade_mode": "Manual",
        "spot_restore_policy": {"enabled": True},
    }


def _create_scale_sets(compute_client):
    """Start creation of every scale set in SCALE_SETS, then wait for all."""
    pollers = []
    for scaleset_name in SCALE_SETS:
        vmss = compute_client.virtual_machine_scale_sets.begin_create_or_update(
            GROUP_NAME,
            scaleset_name,
            _scaleset_config(scaleset_name),
        )
        print(f"{scaleset_name} {vmss.status()}")
        pollers.append(vmss)

    # Block on the long-running operations only after all have been started.
    for scaleset_name, vmss in zip(SCALE_SETS, pollers):
        print(f"Created scale set {scaleset_name}:\n{vmss.result()}")


def _delete_scale_sets(compute_client):
    """Start deletion of every scale set in SCALE_SETS, then wait for all."""
    delete_results = [
        compute_client.virtual_machine_scale_sets.begin_delete(GROUP_NAME, scaleset_name)
        for scaleset_name in SCALE_SETS
    ]
    for scaleset_name, delete_result in zip(SCALE_SETS, delete_results):
        delete_result.result()
        print(f"Deleted scale set {scaleset_name}")


def main():
    """Create or delete the Azure VM scale sets that host the training swarm.

    Usage: ``manage_scaleset.py {create,delete}``.  Requires SUBSCRIPTION_ID
    and AZURE_PASS in the environment (see module constants above).
    """
    parser = ArgumentParser()
    parser.add_argument('command', choices=('create', 'delete'))
    args = parser.parse_args()

    # One credential instance shared by all three management clients.
    credential = DefaultAzureCredential()
    resource_client = ResourceManagementClient(credential=credential, subscription_id=SUBSCRIPTION_ID)
    network_client = NetworkManagementClient(credential=credential, subscription_id=SUBSCRIPTION_ID)
    compute_client = ComputeManagementClient(credential=credential, subscription_id=SUBSCRIPTION_ID)

    if args.command == 'create':
        # Bug fix: previously the resource group / vnet / subnet were
        # provisioned unconditionally, even when the command was 'delete'.
        _make_network(resource_client, network_client)
        _create_scale_sets(compute_client)
    else:
        _delete_scale_sets(compute_client)


if __name__ == "__main__":
    main()

+ 1 - 3
run_trainer.py

@@ -20,7 +20,7 @@ transformers.utils.logging.set_verbosity_warning()
 use_hivemind_log_handler("in_root_logger")
 logger = get_logger(__name__)
 
-torch.set_num_threads(min(torch.get_num_threads(), 4))  # Otherwise, it becomes very slow on machines with ~100 CPUs
+torch.set_num_threads(1)  # Otherwise, it becomes very slow on machines with ~100 CPUs
 
 
 def main():
@@ -28,8 +28,6 @@ def main():
     training_peer_args, trainer_args, collab_args = parser.parse_args_into_dataclasses()
 
     logger.info(f"Trying {len(training_peer_args.initial_peers)} initial peers: {training_peer_args.initial_peers}")
-    # if len(training_peer_args.initial_peers) == 0:
-    #     logger.warning("Please specify at least one network endpoint in initial peers.")
 
     utils.log_process_rank(trainer_args)
     task = TrainingTask(training_peer_args, trainer_args, collab_args)

+ 1 - 2
task.py

@@ -121,8 +121,7 @@ class TrainingTask:
             self._collaborative_optimizer = hivemind.Optimizer(
                 dht=self.dht, run_id=self.peer_args.experiment_prefix,
                 params=params, optimizer=opt, scheduler=scheduler,
-                offload_optimizer=True,
-                delay_grad_averaging=False, delay_optimizer_step=True,
+                offload_optimizer=True, delay_grad_averaging=False, delay_optimizer_step=True,
                 batch_size_per_step=self.trainer_args.batch_size_per_step,
                 grad_compression=averaging_compression, state_averaging_compression=averaging_compression,
                 client_mode=self.peer_args.client_mode, verbose=True,