Jelajahi Sumber

Tweak settings for the upcoming demo (#2)

Co-authored-by: justheuristic <justheuristic@gmail.com>
Alexander Borzunov 3 tahun lalu
induk
melakukan
c365b2ec9f
4 mengubah file dengan 244 tambahan dan 9 penghapusan
  1. 6 4
      arguments.py
  2. 236 0
      manage_scaleset.py
  3. 1 3
      run_trainer.py
  4. 1 2
      task.py

+ 6 - 4
arguments.py

@@ -15,7 +15,7 @@ class HFTrainerArguments(TrainingArguments):
     text_seq_length: int = 256
 
     # DALLE-specific params
-    learning_rate: float = 0.003535
+    learning_rate: float = 0.0025
     adam_beta1: float = 0.9
     adam_beta2: float = 0.96
     max_grad_norm: float = 4.0
@@ -27,7 +27,6 @@ class HFTrainerArguments(TrainingArguments):
     clamp_value: float = 10000.0
 
     fp16: bool = False
-    fp16_opt_level: str = "O2"
     do_train: bool = True
 
     logging_steps: int = 100
@@ -61,14 +60,17 @@ class TPUTrainerArguments(HFTrainerArguments):
 class CollaborativeArguments:
     """Configuration for CollaborativeOptimzier and its internals"""
     target_batch_size: int = field(
-        default=16384,
+        default=4096,
         metadata={"help": "Perform optimizer step after all peers collectively accumulate this many samples"},
     )
     matchmaking_time: float = field(
         default=15.0, metadata={"help": "Averaging group will wait for stragglers for at most this many seconds"}
     )
+    allreduce_timeout: float = field(
+        default=60, metadata={"help": "Give up on a given all-reduce round after this many seconds"}
+    )
     averaging_timeout: float = field(
-        default=120, metadata={"help": "Give up on averaging step after this many seconds"}
+        default=180, metadata={"help": "Give up on averaging step after this many seconds"}
     )
     reuse_grad_buffers: bool = field(default=True, metadata={
         "help": "Whether or not to use model's .grad buffers for accumulating gradients across local steps. This "

+ 236 - 0
manage_scaleset.py

@@ -0,0 +1,236 @@
+import os
+from argparse import ArgumentParser
+from base64 import b64encode
+
+from azure.identity import DefaultAzureCredential
+from azure.mgmt.compute import ComputeManagementClient
+from azure.mgmt.network import NetworkManagementClient
+from azure.mgmt.resource import ResourceManagementClient
+
+
# NOTE(review): banner warns about a harmless 'gi' import error apparently
# seen on the base VM image — the underlying reason is not visible here.
print("=======================WARNING=======================")
print("= The code may fail to import 'gi' but that is okay =")
print("===================END OF WARNING====================")
# Azure subscription to operate in; required environment variable.
SUBSCRIPTION_ID = os.environ["SUBSCRIPTION_ID"]
# Resource group / virtual network / subnet names and region for the swarm.
GROUP_NAME = "dalle_west2"
NETWORK_NAME = "vnet"
SUBNET_NAME = "subnet"
LOCATION = "westus2"
# Admin password for the VMs' 'hivemind' user; required environment variable.
ADMIN_PASS = os.environ['AZURE_PASS']

# Names of the VM scale sets to manage and how many VMs each should contain.
SCALE_SETS = ('worker',)
SWARM_SIZE = 4
+
# Secrets are injected from the environment instead of being committed in plain
# text, matching how SUBSCRIPTION_ID / AZURE_PASS are handled above.
# NOTE(review): the previous revision hard-coded a GitHub personal access token
# and a W&B API key directly in this payload; those credentials are leaked in
# version control history and must be revoked regardless of this fix.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")

# cloud-init payload for worker VMs: installs conda + pytorch + hivemind, then
# registers a one-shot systemd unit that clones the training repo and launches
# run_trainer.py on first boot.  Literal shell ${...} expansions are written as
# ${{...}} because this is an f-string.
WORKER_CLOUD_INIT = f"""#cloud-config
package_update: true
packages:
  - build-essential
  - wget
  - git
  - vim
write_files:
  - path: /home/hivemind/init_worker.sh
    permissions: '0766'
    owner: root:root
    content: |
      #!/usr/bin/env bash
      set -e
      wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O install_miniconda.sh
      bash install_miniconda.sh -b -p /opt/conda
      export PATH="/opt/conda/bin:${{PATH}}"
      conda install python~=3.8.0 pip
      conda install pytorch cudatoolkit=11.1 -c pytorch -c nvidia
      conda clean --all
      pip install https://github.com/learning-at-home/hivemind/archive/scaling_tweaks.zip
      systemctl enable testserv
      systemctl start testserv
  - path: /etc/systemd/system/testserv.service
    permissions: '0777'
    owner: root:root
    content: |
      [Unit]
      Description=One Shot

      [Service]
      ExecStart=/etc/createfile
      Type=oneshot
      RemainAfterExit=yes

      [Install]
      WantedBy=multi-user.target
  - path: /etc/createfile
    permissions: '0777'
    owner: root:root
    content: |
      #!/bin/bash
      export PATH="/opt/conda/bin:${{PATH}}"
      cd /home/hivemind
      ulimit -n 8192
      
      git clone https://{GITHUB_TOKEN}@github.com/learning-at-home/dalle-hivemind.git -b azure
      cd dalle-hivemind
      pip install -r requirements.txt
      pip install -U transformers==4.10.2 datasets==1.11.0
      
      WANDB_API_KEY={WANDB_API_KEY} python run_trainer.py --run_name $(hostname) \
         --experiment_prefix dalle_large_5groups \
         --initial_peers /ip4/52.232.13.142/tcp/31334/p2p/QmZLrSPKAcP4puJ8gUGvQ155thk5Q6J7oE5exMUSq1oD5i \
         --per_device_train_batch_size 1 --gradient_accumulation_steps 1
runcmd:
  - bash /home/hivemind/init_worker.sh
"""
+
+
def _make_network(resource_client, network_client):
    """Create (or update) the resource group, virtual network and subnet.

    All calls are idempotent ``create_or_update`` operations, so re-running
    ``create`` against existing infrastructure is safe.
    """
    resource_client.resource_groups.create_or_update(
        GROUP_NAME,
        {"location": LOCATION},
    )
    network_client.virtual_networks.begin_create_or_update(
        GROUP_NAME,
        NETWORK_NAME,
        {
            "location": LOCATION,
            "address_space": {"address_prefixes": ["10.0.0.0/16"]},
        },
    ).result()
    network_client.subnets.begin_create_or_update(
        GROUP_NAME,
        NETWORK_NAME,
        SUBNET_NAME,
        {"address_prefix": "10.0.0.0/16"},
    ).result()


def _scaleset_config(scaleset_name):
    """Build the VMSS creation payload for one worker scale set.

    The VM image is NVIDIA's NGC base image; the worker software itself is
    installed on first boot via the cloud-init payload in WORKER_CLOUD_INIT,
    passed base64-encoded through ``custom_data``.
    """
    vm_image = {
        "exactVersion": "21.06.0",
        "offer": "ngc_base_image_version_b",
        "publisher": "nvidia",
        "sku": "gen2_21-06-0",
        "version": "latest",
    }
    return {
        "sku": {
            "tier": "Standard",
            "capacity": SWARM_SIZE,
            "name": "Standard_NC4as_T4_v3",  # 4 vCPU + one T4 GPU per worker
        },
        # Marketplace images require a matching purchase plan.
        "plan": {
            "name": "gen2_21-06-0",
            "publisher": "nvidia",
            "product": "ngc_base_image_version_b",
        },
        "location": LOCATION,
        "virtual_machine_profile": {
            "storage_profile": {
                "image_reference": vm_image,
                "os_disk": {
                    "caching": "ReadWrite",
                    "managed_disk": {"storage_account_type": "Standard_LRS"},
                    "create_option": "FromImage",
                    "disk_size_gb": "32",
                },
            },
            "os_profile": {
                "computer_name_prefix": scaleset_name,
                "admin_username": "hivemind",
                "admin_password": ADMIN_PASS,
                "linux_configuration": {
                    "disable_password_authentication": True,
                    "ssh": {
                        "public_keys": [
                            {
                                "key_data": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDPFugAsrqEsqxj+hKDTfgrtkY26jqCjRubT5vhnJLhtkDAqe5vJ1donWfUVhtBfnqGr92LPmJewPUd9hRa1i33FLVVdkFAs5/Cg8/YbzR8B8e1Y+Nl5HeT7Dq1i+cPEbA1EZAm9tqK4VWYeCMd3CDkoJVuweTwyja08mxtnVNwKCeY4oBKQCE5QlliAKaQnGpJE6MRnbudWM9Ly1wM6OaJVdGwsfPfEG/sSDip4q/8x/KGAzKbhE6ax15Yu/Bu12ahcIdScQsYK9Y6Sm57MHQQLWQO1G+3h3oCTXQ0BGaSMWKXsjmHsB7f9kLZ1j8yMoGlgbpWbjB0ZVsK/4Zh8Ho3h9gDXADzt1j69qT1aERWCt7fxp9+WOLsCTw1W/W9FY2Ia4niVh2/wEwT9AcOBcAqBl7kXQAoUpP8b2Xb+KNXyTEtVB562EdFn+LmG1gZAy8J3piy2/zoo16QJP5PjpKW5GFxL6BRYLtG+uxgx1Glya617T0dtJF/X2vxjT45QK3FaFH1Zd+vhpcLg94fOPNPEhNU7EeBVp8CGYNd+aXVIPsb0I7EIVu9wWi3/a7y86cUedal61fEigfmAQkC7AHYiAiiT94eARj0N+KgjEy2UOITSCJJTHuamYWO8jZc/n7yAqr6mxOKn5ZjBTfAR9bNB/D+HpL6yepI1UDGBVk4DQ== justHeuristic@gmail.com\n",
                                "path": "/home/hivemind/.ssh/authorized_keys",
                            }
                        ]
                    },
                },
                # cloud-init custom_data must be base64-encoded.
                "custom_data": b64encode(WORKER_CLOUD_INIT.encode('utf-8')).decode('latin-1'),
            },
            "network_profile": {
                "network_interface_configurations": [
                    {
                        "name": "test",
                        "primary": True,
                        "enable_accelerated_networking": True,
                        "ip_configurations": [
                            {
                                "name": "test",
                                "subnet": {
                                    "id": f"/subscriptions/{SUBSCRIPTION_ID}/resourceGroups/{GROUP_NAME}/providers/Microsoft.Network/virtualNetworks/{NETWORK_NAME}/subnets/{SUBNET_NAME}"
                                },
                                # Public IP per VM so peers are reachable from outside the vnet.
                                "public_ip_address_configuration": {
                                    "name": "pub1",
                                    "idle_timeout_in_minutes": 15,
                                },
                            }
                        ],
                    }
                ]
            },
            "diagnostics_profile": {"boot_diagnostics": {"enabled": True}},
            # Spot VMs with deallocate-on-eviction + spot restore: evicted
            # workers come back automatically when capacity reappears.
            "priority": "spot",
            "eviction_policy": "deallocate",
        },
        "upgrade_policy": {"mode": "Manual"},
        # NOTE(review): "upgrade_mode" duplicates upgrade_policy.mode and looks
        # like it is not a recognized top-level field — kept for parity, confirm
        # against the VMSS API schema before removing.
        "upgrade_mode": "Manual",
        "spot_restore_policy": {"enabled": True},
    }


def _create_scale_sets(compute_client):
    """Start creation of every scale set in SCALE_SETS, then wait for all."""
    pollers = []
    for scaleset_name in SCALE_SETS:
        vmss = compute_client.virtual_machine_scale_sets.begin_create_or_update(
            GROUP_NAME,
            scaleset_name,
            _scaleset_config(scaleset_name),
        )
        print(f"{scaleset_name} {vmss.status()}")
        pollers.append(vmss)

    # Block on the long-running operations only after all have been started.
    for scaleset_name, vmss in zip(SCALE_SETS, pollers):
        print(f"Created scale set {scaleset_name}:\n{vmss.result()}")


def _delete_scale_sets(compute_client):
    """Start deletion of every scale set in SCALE_SETS, then wait for all."""
    delete_results = [
        compute_client.virtual_machine_scale_sets.begin_delete(GROUP_NAME, scaleset_name)
        for scaleset_name in SCALE_SETS
    ]
    for scaleset_name, delete_result in zip(SCALE_SETS, delete_results):
        delete_result.result()
        print(f"Deleted scale set {scaleset_name}")


def main():
    """Create or delete the Azure VM scale sets that host the training swarm.

    Usage: ``manage_scaleset.py {create,delete}``.  Requires SUBSCRIPTION_ID
    and AZURE_PASS in the environment (see module constants above).
    """
    parser = ArgumentParser()
    parser.add_argument('command', choices=('create', 'delete'))
    args = parser.parse_args()

    # One credential instance shared by all three management clients.
    credential = DefaultAzureCredential()
    resource_client = ResourceManagementClient(credential=credential, subscription_id=SUBSCRIPTION_ID)
    network_client = NetworkManagementClient(credential=credential, subscription_id=SUBSCRIPTION_ID)
    compute_client = ComputeManagementClient(credential=credential, subscription_id=SUBSCRIPTION_ID)

    if args.command == 'create':
        # Bug fix: previously the resource group / vnet / subnet were
        # provisioned unconditionally, even when the command was 'delete'.
        _make_network(resource_client, network_client)
        _create_scale_sets(compute_client)
    else:
        _delete_scale_sets(compute_client)


if __name__ == "__main__":
    main()

+ 1 - 3
run_trainer.py

@@ -20,7 +20,7 @@ transformers.utils.logging.set_verbosity_warning()
 use_hivemind_log_handler("in_root_logger")
 logger = get_logger(__name__)
 
-torch.set_num_threads(min(torch.get_num_threads(), 4))  # Otherwise, it becomes very slow on machines with ~100 CPUs
+torch.set_num_threads(1)  # Otherwise, it becomes very slow on machines with ~100 CPUs
 
 
 def main():
@@ -28,8 +28,6 @@ def main():
     training_peer_args, trainer_args, collab_args = parser.parse_args_into_dataclasses()
 
     logger.info(f"Trying {len(training_peer_args.initial_peers)} initial peers: {training_peer_args.initial_peers}")
-    # if len(training_peer_args.initial_peers) == 0:
-    #     logger.warning("Please specify at least one network endpoint in initial peers.")
 
     utils.log_process_rank(trainer_args)
     task = TrainingTask(training_peer_args, trainer_args, collab_args)

+ 1 - 2
task.py

@@ -121,8 +121,7 @@ class TrainingTask:
             self._collaborative_optimizer = hivemind.Optimizer(
                 dht=self.dht, run_id=self.peer_args.experiment_prefix,
                 params=params, optimizer=opt, scheduler=scheduler,
-                offload_optimizer=True,
-                delay_grad_averaging=False, delay_optimizer_step=True,
+                offload_optimizer=True, delay_grad_averaging=False, delay_optimizer_step=True,
                 batch_size_per_step=self.trainer_args.batch_size_per_step,
                 grad_compression=averaging_compression, state_averaging_compression=averaging_compression,
                 client_mode=self.peer_args.client_mode, verbose=True,