
add minimalistic benchmarks

justheuristic · 3 years ago
commit a798ea04a6
4 files changed, 83 insertions, 4 deletions
  1. README.md (+11, -0)
  2. cli/config.json (+20, -0)
  3. cli/inference_one_block.py (+52, -0)
  4. cli/run_server.py (+0, -4)

+ 11 - 0
README.md

@@ -21,3 +21,14 @@ pip install bitsandbytes-cuda113==0.26.0
 pip install https://github.com/learning-at-home/hivemind/archive/dac8940c324dd612d89c773b51a53e4a04c59064.zip
 pip install https://github.com/huggingface/transformers/archive/224bde91caff4ccfd12277ab5e9bf97c61e22ee9.zip
 ```
+
+
+# tests
+
+```bash
+# run one bloom block for a few steps
+python -m cli.inference_one_block --config cli/config.json  # see other args
+
+# minimalistic server
+python -m cli.run_server --block_config bigscience/bloom-6b3 --num_blocks 2
+```

+ 20 - 0
cli/config.json

@@ -0,0 +1,20 @@
+{
+  "apply_residual_connection_post_layernorm": false,
+  "attention_dropout": 0.0,
+  "attention_softmax_in_fp32": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_dropout": 0.0,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "masked_softmax_fusion": true,
+  "model_type": "bloom",
+  "n_embed": 14336,
+  "n_layer": 70,
+  "num_attention_heads": 112,
+  "pretraining_tp": 4,
+  "slow_but_exact": false,
+  "transformers_version": "4.20.0.dev0",
+  "use_cache": true,
+  "vocab_size": 250880
+}
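
For reference, this config describes the full-size BLOOM (176B) architecture (hidden size 14336, 70 layers, 112 attention heads), but the benchmark only ever instantiates a single block from it. A minimal sketch of how the file is consumed, mirroring the calls made by the `cli/inference_one_block.py` script added below (and assuming the repo's `src.bloom` modules are importable):

```python
from src.bloom.block import BloomBlock
from src.bloom.model import DistributedBloomConfig

# Parse cli/config.json into a transformers-style config object
config = DistributedBloomConfig.from_json_file("cli/config.json")

# Build a single transformer block of that shape (block index 0)
block = BloomBlock(config, 0)
```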

+ 52 - 0
cli/inference_one_block.py

@@ -0,0 +1,52 @@
+import argparse
+
+import torch
+from hivemind.utils.logging import get_logger, use_hivemind_log_handler
+from tqdm.auto import trange
+
+from src.bloom.block import BloomBlock
+from src.bloom.model import DistributedBloomConfig
+from src.bloom.ops import build_alibi_tensor
+
+
+use_hivemind_log_handler("in_root_logger")
+logger = get_logger(__file__)
+
+
+def print_device_info(device=None):
+    """Prints device stats. Code from https://stackoverflow.com/a/53374933/12891528"""
+    device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
+    logger.info(f"Using device: {device}")
+
+    # Additional Info when using cuda
+    if device.type == "cuda":
+        logger.info(torch.cuda.get_device_name(0))
+        logger.info("Memory usage:")
+        logger.info(f"Allocated: {round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1)} GB")
+        logger.info(f"Reserved:  {round(torch.cuda.memory_reserved(0) / 1024 ** 3, 1)} GB")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Run a single bloom block locally on dummy data")
+    parser.add_argument("--config", required=True, type=str, help="Path to a config json file")
+    parser.add_argument("--state_dict", default=None, type=str, help="Optional path to saved block state dict")
+    parser.add_argument("--layer_index", default=0, type=int, help="Index of the bloom block to benchmark")
+    parser.add_argument("--num_steps", default=500, type=int, help="How many inference steps to run")
+    parser.add_argument("--device", default=None, type=str, help="Run inference on this device")
+    args = parser.parse_args()
+
+    if args.device is None:
+        args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    config = DistributedBloomConfig.from_json_file(args.config)
+    block = BloomBlock(config, args.layer_index).to(args.device)
+    if args.state_dict is not None:
+        # load saved block weights if provided; otherwise the block runs with random initialization
+        block.load_state_dict(torch.load(args.state_dict, map_location=args.device))
+
+    cache = None
+
+    for i in trange(args.num_steps):
+        dummy_input = torch.randn(1, 1, config.hidden_size, device=args.device)
+        alibi = build_alibi_tensor(i + 1, config.num_attention_heads).to(args.device)
+        with torch.no_grad():
+            outputs, cache = block.forward(dummy_input, alibi=alibi, use_cache=True, layer_past=cache)
+
+    print_device_info(args.device)
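
As written, the benchmark's figure of merit is the iterations-per-second readout that `trange` prints while the loop runs. If explicit per-step latencies are wanted, a rough sketch of the same loop with timing added is shown below; the `time.perf_counter` and `torch.cuda.synchronize` calls are illustrative additions, not part of this commit.

```python
import time

import torch

from src.bloom.ops import build_alibi_tensor  # same helper the script above uses


def benchmark_block(block, config, device, num_steps=500):
    """Return the mean single-token latency (seconds) of one BloomBlock with a growing attention cache."""
    cache, latencies = None, []
    for i in range(num_steps):
        dummy_input = torch.randn(1, 1, config.hidden_size, device=device)
        alibi = build_alibi_tensor(i + 1, config.num_attention_heads).to(device)

        if device.startswith("cuda"):
            torch.cuda.synchronize(device)  # finish pending kernels before starting the clock
        start = time.perf_counter()
        with torch.no_grad():
            _, cache = block.forward(dummy_input, alibi=alibi, use_cache=True, layer_past=cache)
        if device.startswith("cuda"):
            torch.cuda.synchronize(device)  # wait for this step's kernels before stopping the clock
        latencies.append(time.perf_counter() - start)

    return sum(latencies) / len(latencies)
```

Calling `benchmark_block(block, config, args.device)` after the setup in the script above would give an average per-token time for that block.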

+ 0 - 4
cli/run_server.py

@@ -1,7 +1,3 @@
-import os, sys
-
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # add path to src
-
 import configargparse
 
 from hivemind.proto.runtime_pb2 import CompressionType