Explorar o código

analyze openfiles

justheuristic hai 4 anos
pai
achega
71c7b8047f
Modificáronse 2 ficheiros con 23 adicións e 0 borrados
  1. 22 0
      examples/albert/run_trainer.py
  2. 1 0
      examples/albert/run_training_monitor.py

+ 22 - 0
examples/albert/run_trainer.py

@@ -3,9 +3,12 @@
 import logging
 import os
 import pickle
+import threading
+import time
 from dataclasses import asdict
 from pathlib import Path
 
+import psutil
 import torch
 import transformers
 from datasets import load_from_disk
@@ -27,6 +30,25 @@ logger = logging.getLogger(__name__)
 LRSchedulerBase = getattr(torch.optim.lr_scheduler, "_LRScheduler", None)
 
 
+def analyze_openfiles_periodically():
+    """Every 30 s, forever, log open-file counts for this process and its current children."""
+    import contextlib  # function-scope import keeps this change self-contained
+    while True:
+        logger.info("Scanning open files")
+        for child in [psutil.Process()] + psutil.Process().children(recursive=True):  # re-list: children come and go
+            with contextlib.suppress(psutil.Error):  # exited/inaccessible process: skip it
+                name, open_files = child.name(), child.open_files()
+                logger.info(f"proc: '{name}' files: {len(open_files)}")
+                if len(open_files) > 100:
+                    logger.info(f"proc: {name} has {len(open_files)} open files: {repr(open_files)}")
+        logger.info("DONE scanning")
+        time.sleep(30)
+
+
+analyzer = threading.Thread(target=analyze_openfiles_periodically, daemon=True)  # endless loop; don't block exit
+analyzer.start()
+
+
 def setup_logging(training_args):
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",

+ 1 - 0
examples/albert/run_training_monitor.py

@@ -20,6 +20,7 @@ from arguments import AveragerArguments, BaseTrainingArguments, CollaborativeOpt
 
 logger = logging.getLogger(__name__)
 
+from run_trainer import analyze_openfiles_periodically
 
 @dataclass
 class TrainingMonitorArguments(BaseTrainingArguments):