Explorar o código

deploy swarm on local & remote machines

Dmitry Baranchuk hai 3 anos
pai
achega
f60a7dd183

+ 11 - 12
cli/deploy_server.sh

@@ -5,12 +5,12 @@
 #################
 
 instructions() {
-  echo "Usage: $0 [-i] [ -d ] [ -s ] [ -b ] [-p] [-t]" >&2
+  echo "Usage: $0 [-i] [ -d ] [ -p ] [ -b ] [-a] [-t]" >&2
   echo " -i: initial peer"
   echo " -d: device" >&2
-  echo " -s: server_id" >&2
+  echo " -p: server identity path" >&2
   echo " -b: block_ids" >&2
-  echo " -p: port to run a server" >&2
+  echo " -a: host maddrs" >&2
   echo " -t: whether to run local tests" >&2
   exit 1
 }
@@ -19,18 +19,17 @@ if [ ! $# -ge 8 ]; then
     instructions
 fi
 
-while getopts ":i:d:s:b:p:t:" option; do
+while getopts ":i:d:p:b:a:t:" option; do
     case $option in
-        i) INITIAL_PEER=${OPTARG}
+        i)  INITIAL_PEER=${OPTARG}
             ;;
         d)  DEVICE=${OPTARG}
-            #((${DEVICE} == 'cpu' || ${DEVICE} == 'cuda')) || instructions
             ;;
-        s)  SERVER_ID=${OPTARG}
+        p)  SERVER_ID_PATH=${OPTARG}
             ;;
         b)  BLOCK_IDS=${OPTARG}
             ;;
-        p)  PORT=${OPTARG}
+        a)  HOST_MADDR=${OPTARG} # TODO: allow several maddrs 
             ;;
         t)  RUN_LOCAL_TESTS=true
             ;;
@@ -45,8 +44,8 @@ echo "= Config ="
 echo "=========="
 echo "Initial peer: ${INITIAL_PEER}"
 echo "Device: ${DEVICE}"
-echo "Server name: server${SERVER_ID}.id"
-echo "Server address: /ip4/127.0.0.1/tcp/${PORT}"
+echo "Server name: ${SERVER_ID_PATH}"
+echo "Server address: ${HOST_MADDR}"
 echo "Bloom blocks: ${BLOCK_IDS}"
 
 
@@ -54,6 +53,7 @@ echo "Bloom blocks: ${BLOCK_IDS}"
 # Install or activate env #
 ###########################
 
+# TODO fix bug with self calling
 source ~/miniconda3/etc/profile.d/conda.sh
 if conda env list | grep ".*bloom-demo.*"  >/dev/null 2>/dev/null; then
     conda activate bloom-demo
@@ -62,7 +62,6 @@ else
     conda activate bloom-demo
 
     conda install -y -c conda-forge cudatoolkit-dev==11.3.1 cudatoolkit==11.3.1 cudnn==8.2.1.32
-    # Specify -i https://pypi.org/simple at ultramar
     pip install -i https://pypi.org/simple torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html
     pip install -i https://pypi.org/simple accelerate==0.10.0 huggingface-hub==0.7.0 hivemind==1.1.0
     pip install -i https://pypi.org/simple bitsandbytes-cuda113==0.26.0
@@ -85,4 +84,4 @@ fi
 ##############
 
 python -m cli.run_server --prefix bloom6b3 --converted_model_name_or_path bigscience/test-bloomd-6b3 --device ${DEVICE} --initial_peer ${INITIAL_PEER} \
-  --block_indices ${BLOCK_IDS} --torch_dtype float32 --identity_path ./server${SERVER_ID}.id --host_maddrs /ip4/127.0.0.1/tcp/${PORT} 2> server_${SERVER_ID}.out
+  --block_indices ${BLOCK_IDS} --torch_dtype float32 --identity_path ${SERVER_ID_PATH} --host_maddrs ${HOST_MADDR} &> ${SERVER_ID_PATH}.log

+ 5 - 0
cli/local_server_config_example.cfg

@@ -0,0 +1,5 @@
+device=cpu
+block_ids=2:3
+id_path=./server.id
+maddr=/ip4/127.0.0.1/tcp/30000
+#

+ 6 - 0
cli/remote_server_config_example.cfg

@@ -0,0 +1,6 @@
+name=bloom-peer-0.bloom.net
+device=cpu
+block_ids=1:3
+id_path=./server.id
+maddr=/ip4/0.0.0.0/tcp/30000
+#

+ 111 - 0
cli/run_local_servers.sh

@@ -0,0 +1,111 @@
+# !/usr/bin/env bash
+
+#################
+# Parse options #
+#################
+
+# Print the command-line usage summary to stderr, then exit with status 1.
+instructions() {
+  printf '%s\n' \
+    "Usage: $0 [-n] [-c]" \
+    " -n: number of servers to run" \
+    " -c: path to the server configs" >&2
+  exit 1
+}
+
+# Exactly two option/value pairs are expected: -n <count> -c <config dir>.
+if [ "$#" -ne 4 ]; then
+    instructions
+fi
+
+# The leading ':' selects getopts' silent mode: an unknown option is reported
+# as '?' and an option that is missing its argument is reported as ':'.
+# Only the options that are actually handled below are declared.
+while getopts ":n:c:" option; do
+    case "${option}" in
+        n)  NUM_SERVERS=${OPTARG}
+            ;;
+        c)  CONFIG_PATH=${OPTARG}
+            ;;
+        :)  instructions
+            ;;
+        \?) instructions
+            ;;
+    esac
+done
+
+# Both values are needed later in the script; four arguments alone do not
+# guarantee that (for example, the same option may have been given twice).
+if [ -z "${NUM_SERVERS:-}" ] || [ -z "${CONFIG_PATH:-}" ]; then
+    instructions
+fi
+
+
+###########################
+# Install or activate env #
+###########################
+
+source ~/miniconda3/etc/profile.d/conda.sh
+if conda env list | grep ".*bloom-demo.*"  &>/dev/null; then
+    conda activate bloom-demo
+else
+    conda create -y --name bloom-demo python=3.8.12 pip
+    conda activate bloom-demo
+
+    conda install -y -c conda-forge cudatoolkit-dev==11.3.1 cudatoolkit==11.3.1 cudnn==8.2.1.32
+    pip install -i https://pypi.org/simple torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html
+    pip install -i https://pypi.org/simple accelerate==0.10.0 huggingface-hub==0.7.0 hivemind==1.1.0
+    pip install -i https://pypi.org/simple bitsandbytes-cuda113==0.26.0
+    pip install -i https://pypi.org/simple https://github.com/huggingface/transformers/archive/6589e510fa4e6c442059de2fab84752535de9b23.zip
+fi
+
+
+#######################
+# Create Initial peer #
+#######################
+
+# Launch hivemind-dht in the background, sending stdout and stderr to tmp.out.
+hivemind-dht > tmp.out 2>&1 &
+# Fixed pause before tmp.out is read.
+sleep 3
+# The peer address is taken as the last whitespace-separated token on the
+# second line of the captured output.
+INITIAL_PEER=$(python -c "with open('tmp.out') as f: print(f.readlines()[1].split()[-1])" )
+printf '%s\n' "Initial peer: ${INITIAL_PEER}"
+
+
+##############################
+# Initialize the config file #
+##############################
+
+typeset -A cfg 
+cfg=( # set default values in config array
+    [device]="cpu"
+    [block_ids]="1:2"
+    [id_path]="server.id"
+    [maddr]="/ip4/127.0.0.1/tcp/30000"
+)
+
+###############
+# Run servers #
+###############
+
+for SERVER_ID in $(seq 0 $(( $NUM_SERVERS - 1 )) )
+do  
+    ###############
+    # Read config #
+    ###############
+
+    while read line
+    do
+        if echo $line | grep -F = &>/dev/null
+        then
+            varname=$(echo "$line" | cut -d '=' -f 1)
+            cfg[$varname]=$(echo "$line" | cut -d '=' -f 2-)
+        fi
+    done < ${CONFIG_PATH}/server_${SERVER_ID}.cfg
+    
+    echo "=== Server #${SERVER_ID} ==="
+    echo "Server ID: ${id_path}"
+    echo "Device: ${cfg[device]}"
+    echo "Bloom block ids: ${cfg[block_ids]}"
+    echo "Host maddr: ${cfg[maddr]}"
+    echo ""
+    
+    ##############
+    # Run server #
+    ##############
+
+    tmux new-session -d -s "Server_${SERVER_ID}" bash deploy_server.sh -i ${INITIAL_PEER} -d ${cfg[device]} -p ${cfg[id_path]} -b ${cfg[block_ids]} -a ${cfg[maddr]}
+done
+
+
+#####################
+# Kill initial peer #
+#####################
+
+sleep 10
+pkill -f hivemind-dht # TODO: kill only particular pids of hivemind-dht
+rm tmp.out

+ 112 - 0
cli/run_remote_servers.sh

@@ -0,0 +1,112 @@
+# !/usr/bin/env bash
+
+# Private key handed to `ssh -i`; replace <YOUR_KEY> with the key file name.
+# ${HOME} is used because the shell does not expand "~" inside double quotes.
+SSH_KEY_PATH="${HOME}/.ssh/<YOUR_KEY>"
+
+#################
+# Parse options #
+#################
+
+# Print the command-line usage summary to stderr, then exit with status 1.
+instructions() {
+  printf '%s\n' \
+    "Usage: $0 [-u] [-n] [-c]" \
+    " -u: username" \
+    " -n: number of servers to run" \
+    " -c: path to the server configs" >&2
+  exit 1
+}
+
+# Exactly three option/value pairs are expected: -u <user> -n <count> -c <config dir>.
+if [ "$#" -ne 6 ]; then
+    instructions
+fi
+
+# The leading ':' selects getopts' silent mode: an unknown option is reported
+# as '?' and an option that is missing its argument is reported as ':'.
+while getopts ":u:n:c:" option; do
+    case "${option}" in
+        u)  USERNAME=${OPTARG}
+            ;;
+        n)  NUM_SERVERS=${OPTARG}
+            ;;
+        c)  CONFIG_PATH=${OPTARG}
+            ;;
+        :)  instructions
+            ;;
+        \?) instructions
+            ;;
+    esac
+done
+
+# All three values are needed later in the script; six arguments alone do not
+# guarantee that (for example, the same option may have been given twice).
+if [ -z "${USERNAME:-}" ] || [ -z "${NUM_SERVERS:-}" ] || [ -z "${CONFIG_PATH:-}" ]; then
+    instructions
+fi
+
+
+###########################
+# Install or activate env #
+###########################
+
+source ~/miniconda3/etc/profile.d/conda.sh
+if conda env list | grep ".*bloom-demo.*"  &>/dev/null; then
+    conda activate bloom-demo
+else
+    conda create -y --name bloom-demo python=3.8.12 pip
+    conda activate bloom-demo
+
+    conda install -y -c conda-forge cudatoolkit-dev==11.3.1 cudatoolkit==11.3.1 cudnn==8.2.1.32
+    pip install -i https://pypi.org/simple torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html
+    pip install -i https://pypi.org/simple accelerate==0.10.0 huggingface-hub==0.7.0 hivemind==1.1.0
+    pip install -i https://pypi.org/simple bitsandbytes-cuda113==0.26.0
+    pip install -i https://pypi.org/simple https://github.com/huggingface/transformers/archive/6589e510fa4e6c442059de2fab84752535de9b23.zip
+fi
+
+
+#######################
+# Create Initial peer #
+#######################
+
+# Launch hivemind-dht in the background, sending stdout and stderr to tmp.out.
+hivemind-dht > tmp.out 2>&1 &
+
+# Fixed pause before tmp.out is read.
+sleep 3
+# The peer address is taken as the second-to-last whitespace-separated token
+# on the second line of the captured output.
+INITIAL_PEER=$(python -c "with open('tmp.out') as f: print(f.readlines()[1].split()[-2])" )
+# The capture file is no longer needed once the address has been extracted.
+rm tmp.out
+printf '%s\n' "Initial peer: ${INITIAL_PEER}"
+
+
+##############################
+# Initialize the config file #
+##############################
+
+typeset -A cfg 
+cfg=( # set default values in config array
+    [name]=""
+    [device]="cpu"
+    [block_ids]="1:2"
+    [id_path]="server.id"
+    [maddr]="/ip4/0.0.0.0/tcp/30000"
+)
+
+###############
+# Run servers #
+###############
+
+for SERVER_ID in $(seq 0 $(( $NUM_SERVERS - 1 )) )
+do  
+    ###############
+    # Read config #
+    ###############
+
+    while read line
+    do
+        if echo $line | grep -F = &>/dev/null
+        then
+            varname=$(echo "$line" | cut -d '=' -f 1)
+            cfg[$varname]=$(echo "$line" | cut -d '=' -f 2-)
+        fi
+    done < ${CONFIG_PATH}/server_${SERVER_ID}.cfg
+    
+    SERVER_NAME="${USERNAME}@${cfg[name]}"
+    echo "=== Server #${SERVER_ID} ==="
+    echo "Server name ${SERVER_NAME}"
+    echo "Server ID: ${cfg[id_path]}"
+    echo "Device: ${cfg[device]}"
+    echo "Bloom block ids: ${cfg[block_ids]}"
+    echo "Host maddr: ${cfg[maddr]}"
+    echo "================="
+    
+    ##############
+    # Run server #
+    ##############
+     
+    ssh -i ${SSH_KEY_PATH} ${SERVER_NAME} "tmux new-session -d -s 'Server_${SERVER_ID}' 'cd bloom-demo && bash cli/deploy_server.sh -i ${INITIAL_PEER} -d ${cfg[device]} -p ${cfg[id_path]} -b ${cfg[block_ids]} -a ${cfg[maddr]}'"
+done