Last active
November 3, 2022 20:42
-
-
Save ryul99/01c05fe49478241295f980d5c39578de to your computer and use it in GitHub Desktop.
Hydra DDP test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import os | |
import hydra | |
import torch | |
import datetime | |
import torch.distributed as dist | |
import torch.multiprocessing as mp | |
from omegaconf import OmegaConf | |
# Module-level logger, named after this script's file name (directory part dropped).
logger = logging.getLogger(os.path.split(__file__)[1])
def setup(cfg, rank):
    """Initialize the default ``torch.distributed`` process group for this worker.

    Exports the rendezvous address/port through the ``MASTER_ADDR`` and
    ``MASTER_PORT`` environment variables, then calls
    ``dist.init_process_group``.

    Args:
        cfg: Config object exposing ``cfg.dist.master_addr``,
            ``cfg.dist.master_port``, ``cfg.dist.mode`` (backend name),
            ``cfg.dist.gpus`` (used as the world size) and
            ``cfg.dist.timeout`` (seconds, or ``None`` to keep the
            1800-second default).
        rank: Rank of the calling process within the group.
    """
    # os.environ only accepts str values; coerce so that a non-string config
    # value (e.g. an unquoted YAML port such as ``master_port: 12355``) does
    # not raise TypeError.
    os.environ["MASTER_ADDR"] = str(cfg.dist.master_addr)
    os.environ["MASTER_PORT"] = str(cfg.dist.master_port)

    timeout_sec = 1800
    if cfg.dist.timeout is not None:
        # NOTE(review): presumably required for NCCL collectives to honor the
        # custom timeout -- confirm against the installed torch version.
        os.environ["NCCL_BLOCKING_WAIT"] = "1"
        timeout_sec = cfg.dist.timeout
    timeout = datetime.timedelta(seconds=timeout_sec)

    # initialize the process group
    dist.init_process_group(
        cfg.dist.mode,
        rank=rank,
        world_size=cfg.dist.gpus,
        timeout=timeout,
    )
def cleanup():
    """Destroy the default process group if one is currently initialized.

    Safe to call when no group exists (for example after a failed or skipped
    initialization) and safe to call more than once; in those cases it does
    nothing instead of raising.
    """
    # is_available() is checked first because is_initialized() only exists
    # when torch was built with distributed support.
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()
def distributed_run(fn, cfg):
    """Spawn ``cfg.dist.gpus`` worker processes running ``fn`` and wait for them.

    Args:
        fn: Worker entry point; ``cfg`` is forwarded to it as an extra
            argument via ``mp.spawn``'s ``args``.
        cfg: Config object; ``cfg.dist.gpus`` gives the number of processes.
    """
    worker_count = cfg.dist.gpus
    extra_args = (cfg,)
    # join=True blocks until the spawned processes have finished.
    mp.spawn(fn, args=extra_args, nprocs=worker_count, join=True)
def train_loop(rank, cfg):
    """Per-process entry point: join the process group, log, then leave it.

    Args:
        rank: Rank of this worker process.
        cfg: Config object forwarded to ``setup``.
    """
    # setup() stays outside the try block so that a failed initialization
    # propagates as-is instead of being followed by a cleanup attempt.
    setup(cfg, rank)
    try:
        logger.info("Hi! I'm info from train_loop")
        logger.warning("Hi! I'm warning from train_loop")
        logger.error("Hi! I'm error from train_loop")
    finally:
        # Always release the process group, even if the body above raises.
        cleanup()
@hydra.main(config_path="DDP_conf.yaml")
def main(hydra_cfg):
    """Log one message per level from the parent process, then launch the workers.

    Args:
        hydra_cfg: Config object supplied by the ``hydra.main`` decorator.
    """
    announcements = (
        (logger.info, "Hi! I'm info from main function"),
        (logger.warning, "Hi! I'm warning from main function"),
        (logger.error, "Hi! I'm error from main function"),
    )
    for emit, message in announcements:
        emit(message)

    distributed_run(train_loop, hydra_cfg)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hydra defaults list: select the `colorlog` option for job and Hydra logging.
defaults:
  - hydra/job_logging: colorlog
  - hydra/hydra_logging: colorlog
# Distributed settings read by setup() and distributed_run() in the script.
dist:
  master_addr: 'localhost'  # exported as MASTER_ADDR
  master_port: '12355'  # exported as MASTER_PORT
  mode: 'nccl'  # backend passed to dist.init_process_group
  gpus: 1  # used as world_size and as the number of spawned processes
  timeout: 30  # process-group timeout in seconds; the script falls back to 1800 when this is null
Try to initialize the logging at the top of your training loop with something like this:
hydra_cfg = HydraConfig.instance().get()
configure_log(hydra_cfg.job_logging, hydra_cfg.verbose)
I fixed the file. I forgot to add setup and cleanup to train_loop, but I don't think this is related to the problem.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You do not call setup. Is that function related to the problem?