#!/usr/bin/env bash
# run_servers_single_machine.sh
  #################
  # Parse options #
  #################

  #######################################
  # Print the command-line usage text and abort the script.
  # Globals:   none
  # Arguments: none
  # Outputs:   usage text on stderr (nothing on stdout)
  # Returns:   never returns; exits the current shell with status 1
  #######################################
  instructions() {
    printf '%s\n' \
      "Usage: $0 [-n] [-c]" \
      " -n: number of servers to run" \
      " -c: path to the server configs" >&2
    exit 1
  }
  # Read -n (number of servers) and -c (directory holding the server configs)
  # into NUM_SERVERS and CONFIG_PATH. Both are mandatory; any parsing problem
  # ends in instructions(), which prints the usage text and exits with 1.
  NUM_SERVERS=""
  CONFIG_PATH=""
  # Leading ':' selects getopts' silent mode, so a missing option argument is
  # reported through the ':' arm and an unknown option through the '?' arm.
  while getopts ":n:c:" option; do
    case "${option}" in
      n)
        NUM_SERVERS=${OPTARG}
        ;;
      c)
        CONFIG_PATH=${OPTARG}
        ;;
      :)
        echo "Option -${OPTARG} requires an argument" >&2
        instructions
        ;;
      \?)
        instructions
        ;;
    esac
  done
  shift $((OPTIND - 1))

  # Validate what was actually parsed instead of counting command-line words:
  # no stray positional arguments, a config path, and a non-negative integer
  # server count (it is used in arithmetic by the launch loop).
  if [[ $# -ne 0 || -z "${CONFIG_PATH}" || ! "${NUM_SERVERS}" =~ ^[0-9]+$ ]]; then
    instructions
  fi
  ###########################
  # Install or activate env #
  ###########################
  # Make the `conda` shell function available in this non-interactive shell.
  source ~/miniconda3/etc/profile.d/conda.sh

  # Match the environment NAME column exactly; a substring match would also
  # accept e.g. "bloom-demo-old" or any path containing "bloom-demo".
  if conda env list | awk '$1 == "bloom-demo" { found = 1 } END { exit !found }'; then
    conda activate bloom-demo
  else
    # First run: create the environment and install the pinned dependencies.
    # The steps are chained so the script stops at the first failure instead
    # of launching servers inside a half-installed environment.
    # Every pip call pins the index with -i https://pypi.org/simple
    # (original note: "Specify -i https://pypi.org/simple at Ultramar").
    if ! {
      conda create -y --name bloom-demo python=3.8.12 pip &&
        conda activate bloom-demo &&
        conda install -y -c conda-forge cudatoolkit-dev==11.3.1 cudatoolkit==11.3.1 cudnn==8.2.1.32 &&
        pip install -i https://pypi.org/simple torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html &&
        pip install -i https://pypi.org/simple accelerate==0.10.0 huggingface-hub==0.7.0 hivemind==1.1.0 &&
        pip install -i https://pypi.org/simple bitsandbytes-cuda113==0.26.0 &&
        pip install -i https://pypi.org/simple https://github.com/huggingface/transformers/archive/6589e510fa4e6c442059de2fab84752535de9b23.zip
    }; then
      echo "Failed to set up the bloom-demo conda environment" >&2
      exit 1
    fi
  fi
  #######################
  # Create Initial peer #
  #######################
  # Start hivemind-dht in the background with its stderr captured in tmp.out;
  # the peer address handed to every server is read from that file below.
  hivemind-dht 2> tmp.out &
  PID=$! # PID of the launched command only; any processes it spawns are not tracked.
  # Fixed grace period so the node has time to write its start-up output.
  sleep 3
  # The address is taken as the last whitespace-separated field of line 2.
  INITIAL_PEER=$(python -c "with open('tmp.out') as f: print(f.readlines()[1].split()[-1])")
  # Without an address the servers have nothing to connect to, so stop here
  # rather than pass an empty -i value to every server. tmp.out is left in
  # place so the failure can be inspected.
  if [[ -z "${INITIAL_PEER}" ]]; then
    echo "Could not read the initial peer address from tmp.out" >&2
    pkill -f hivemind-dht
    exit 1
  fi
  echo "Initial peer: ${INITIAL_PEER}"
  ##############################
  # Initialize the config file #
  ##############################
  # Per-server settings, keyed by option name (device, block_ids, port, ...).
  typeset -A cfg

  ###############
  # Run servers #
  ###############
  # Launch NUM_SERVERS servers, numbered from 0, each in its own detached tmux
  # session named Server_<id>. Reads CONFIG_PATH and INITIAL_PEER.
  for ((SERVER_ID = 0; SERVER_ID < NUM_SERVERS; SERVER_ID++)); do
    # Start every server from the defaults so that a key missing from one
    # server's file does not silently inherit the previous server's value.
    cfg=(
      [device]="cpu"
      [block_ids]="1:2"
      [port]="30000"
    )

    ###############
    # Read config #
    ###############
    # Each line containing '=' is a "key=value" pair; the value is everything
    # after the first '='. Lines without '=' are ignored.
    config_file="${CONFIG_PATH}/server_${SERVER_ID}.cfg"
    if [[ -r "${config_file}" ]]; then
      # `|| [[ -n ... ]]` keeps the last line when the file has no trailing newline.
      while read -r line || [[ -n "${line}" ]]; do
        [[ "${line}" == *=* ]] || continue
        # `read` with the default IFS strips surrounding blanks from each part.
        read -r varname <<<"${line%%=*}"
        read -r value <<<"${line#*=}"
        # An empty key is not a valid associative-array subscript.
        [[ -n "${varname}" ]] || continue
        cfg["${varname}"]=${value}
      done <"${config_file}"
    else
      echo "Cannot read ${config_file}; using default settings" >&2
    fi

    echo "================="
    echo "Server ${SERVER_ID}"
    echo "Device: ${cfg[device]}"
    echo "Bloom block ids: ${cfg[block_ids]}"
    echo "Port: ${cfg[port]}"
    echo "================="

    ##############
    # Run server #
    ##############
    tmux new-session -d -s "Server_${SERVER_ID}" \
      bash deploy_server.sh \
      -i "${INITIAL_PEER}" \
      -d "${cfg[device]}" \
      -s "${SERVER_ID}" \
      -b "${cfg[block_ids]}" \
      -p "${cfg[port]}"
  done
  #####################
  # Kill initial peer #
  #####################
  # Fixed 10 s wait before the bootstrap node is stopped (presumably so the
  # servers can connect to it first; confirm against deploy_server.sh).
  sleep 10
  # Matches every process whose command line contains "hivemind-dht", not
  # only the one started by this script.
  pkill -f hivemind-dht # TODO: kill only particular pids of hivemind-dht
  # -f: do not fail if the log file is already gone.
  rm -f tmp.out