Forráskód Böngészése

Merge branch 'master' into typing_fixes

Michael Diskin 3 éve
szülő
commit
50048e2b53

+ 4 - 1
.github/workflows/run-tests.yml

@@ -35,6 +35,7 @@ jobs:
       - name: Test
         run: |
           cd tests
+          export HIVEMIND_MEMORY_SHARING_STRATEGY=file_descriptor
           pytest --durations=0 --durations-min=1.0 -v
   build_and_test_p2pd:
     runs-on: ubuntu-latest
@@ -61,6 +62,7 @@ jobs:
       - name: Test
         run: |
           cd tests
+          export HIVEMIND_MEMORY_SHARING_STRATEGY=file_descriptor
           pytest -k "p2p" -v
   codecov_in_develop_mode:
 
@@ -87,6 +89,7 @@ jobs:
           pip install -e . --no-use-pep517
       - name: Test
         run: |
-          pytest --cov=hivemind -v tests
+          export HIVEMIND_MEMORY_SHARING_STRATEGY=file_descriptor
+          pytest --cov hivemind -v tests
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v1

+ 4 - 3
examples/albert/README.md

@@ -55,9 +55,7 @@ To join the collaboration with a GPU trainer,
   (see [default paths](./arguments.py#L117-L134) for reference)
 - Run:
   ```bash
-  ./run_trainer.py \
-      --initial_peers ONE_OR_MORE_PEERS \
-      --logging_first_step --output_dir ./outputs --overwrite_output_dir --logging_dir ./logs
+  ./run_trainer.py  --initial_peers ONE_OR_MORE_PEERS --per_device_train_batch_size BATCH_SIZE_FOR_YOUR_GPU
   ```
 
   Here, `ONE_OR_MORE_PEERS` stands for multiaddresses of one or multiple existing peers (training monitors or existing
@@ -82,6 +80,9 @@ To join the collaboration with a GPU trainer,
   You may need to change the IP address to a publicly visible one if some of the initial peers are located behind NAT.
   If you have any trouble doing this, consider the ["Using IPFS"](#using-ipfs) section.
 
+  The `BATCH_SIZE_FOR_YOUR_GPU` should be tweaked so that the model fits into your GPU memory.
+  For 1080Ti or 2080Ti GPUs, a good initial value is 4. For 8GB GPUs, try batch size 1-2.
+
 See the ["Tips and tricks"](#tips-and-tricks) section for more information on setting up collaborative training.
 
 As the peer begins training, it will periodically report training logs in the following form:

+ 9 - 4
examples/albert/arguments.py

@@ -6,7 +6,7 @@ from transformers import TrainingArguments
 
 @dataclass
 class BaseTrainingArguments:
-    experiment_prefix: str = field(
+    run_id: str = field(
         default="albert", metadata={"help": "A unique 'name' of this experiment, used to store metadata on the DHT"}
     )
     initial_peers: List[str] = field(
@@ -127,7 +127,7 @@ class AlbertTrainingArguments(TrainingArguments):
     gradient_accumulation_steps: int = 2
     seq_length: int = 512
 
-    max_steps: int = 125_000  # please note: this affects both number of steps and learning rate schedule
+    total_steps: int = 125_000  # please note: this only affects the learning rate schedule
     learning_rate: float = 0.00176
     warmup_steps: int = 5000
     adam_epsilon: float = 1e-6
@@ -138,9 +138,14 @@ class AlbertTrainingArguments(TrainingArguments):
     fp16: bool = True
     fp16_opt_level: str = "O2"
     do_train: bool = True
+    do_eval: bool = False
 
+    logging_dir: str = "logs"
+    output_dir: str = "outputs"
     logging_steps: int = 100
+    logging_first_step: bool = True
+    overwrite_output_dir: bool = True
+
     save_total_limit: int = 2
     save_steps: int = 500
-
-    output_dir: str = "outputs"
+    max_steps: int = 10 ** 30  # meant as "peer should compute gradients forever"

+ 3 - 3
examples/albert/run_trainer.py

@@ -215,7 +215,7 @@ def main():
     # This data collator will take care of randomly masking the tokens.
     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)
 
-    validators, local_public_key = utils.make_validators(collaboration_args.experiment_prefix)
+    validators, local_public_key = utils.make_validators(collaboration_args.run_id)
 
     dht = DHT(
         start=True,
@@ -260,12 +260,12 @@ def main():
     ]
 
     scheduler = lambda opt: get_linear_schedule_with_warmup(
-        opt, num_warmup_steps=training_args.warmup_steps, num_training_steps=training_args.max_steps
+        opt, num_warmup_steps=training_args.warmup_steps, num_training_steps=training_args.total_steps
     )
 
     optimizer = Optimizer(
         dht=dht,
-        run_id=collaboration_args.experiment_prefix,
+        run_id=collaboration_args.run_id,
         target_batch_size=adjusted_target_batch_size,
         batch_size_per_step=total_batch_size_per_step,
         optimizer=opt,

+ 7 - 6
examples/albert/run_training_monitor.py

@@ -9,7 +9,7 @@ import requests
 import torch
 import wandb
 from torch_optimizer import Lamb
-from transformers import AlbertConfig, AlbertForPreTraining, HfArgumentParser
+from transformers import AlbertConfig, AlbertForPreTraining, HfArgumentParser, get_linear_schedule_with_warmup
 
 import hivemind
 from hivemind.optim.state_averager import TrainingStateAverager
@@ -40,6 +40,7 @@ class TrainingMonitorArguments(BaseTrainingArguments):
     wandb_project: Optional[str] = field(
         default=None, metadata={"help": "Name of Weights & Biases project to report the training progress to"}
     )
+    store_checkpoints: bool = field(default=True, metadata={"help": "If False, disables periodic checkpoint saving"})
     save_checkpoint_step_interval: int = field(
         default=5, metadata={"help": "Frequency (in steps) of fetching and saving state from peers"}
     )
@@ -56,7 +57,6 @@ class TrainingMonitorArguments(BaseTrainingArguments):
     upload_interval: Optional[float] = field(
         default=None, metadata={"help": "Frequency (in seconds) of uploading the model to Hub"}
     )
-    store_checkpoints: bool = field(default=False, metadata={"help": "If True, enables CheckpointHandler"})
 
 
 class CheckpointHandler:
@@ -99,7 +99,8 @@ class CheckpointHandler:
         self.state_averager = TrainingStateAverager(
             dht=dht,
             optimizer=opt,
-            prefix=experiment_prefix,
+            scheduler=get_linear_schedule_with_warmup(opt, num_warmup_steps=5000, num_training_steps=125_000),
+            prefix=f"{run_id}_state_averager",
             state_compression=hivemind.Float16Compression(),
             bandwidth=optimizer_args.bandwidth,
             client_mode=optimizer_args.client_mode,
@@ -155,8 +156,8 @@ if __name__ == "__main__":
         version = ip_address(address).version
         monitor_args.announce_maddrs += [f"/ip{version}/{address}/tcp/0"]
 
-    experiment_prefix = monitor_args.experiment_prefix
-    validators, local_public_key = utils.make_validators(experiment_prefix)
+    run_id = monitor_args.run_id
+    validators, local_public_key = utils.make_validators(run_id)
 
     dht = hivemind.DHT(
         start=True,
@@ -177,7 +178,7 @@ if __name__ == "__main__":
         checkpoint_handler = CheckpointHandler(monitor_args, optimizer_args, averager_args, dht)
 
     while True:
-        metrics_dict = dht.get(experiment_prefix + "_metrics", latest=True)
+        metrics_dict = dht.get(run_id + "_metrics", latest=True)
         if metrics_dict is not None:
             metrics_dict = metrics_dict.value
             metrics = [utils.LocalMetrics.parse_obj(metrics_dict[peer].value) for peer in metrics_dict]

+ 2 - 2
examples/albert/utils.py

@@ -24,9 +24,9 @@ class MetricSchema(BaseModel):
     metrics: Dict[BytesWithPublicKey, LocalMetrics]
 
 
-def make_validators(experiment_prefix: str) -> Tuple[List[RecordValidatorBase], bytes]:
+def make_validators(run_id: str) -> Tuple[List[RecordValidatorBase], bytes]:
     signature_validator = RSASignatureValidator()
-    validators = [SchemaValidator(MetricSchema, prefix=experiment_prefix), signature_validator]
+    validators = [SchemaValidator(MetricSchema, prefix=run_id), signature_validator]
     return validators, signature_validator.local_public_key
 
 

+ 2 - 1
requirements-dev.txt

@@ -1,7 +1,8 @@
-pytest
+pytest==6.2.5  # see https://github.com/pytest-dev/pytest/issues/9621
 pytest-forked
 pytest-asyncio==0.16.0
 pytest-cov
+coverage==6.0.2  # see https://github.com/pytest-dev/pytest-cov/issues/520
 tqdm
 scikit-learn
 torchvision