From d32c9af4bc66b416a74cb48d1164e7aea0a9f8f6 Mon Sep 17 00:00:00 2001
From: "Brian W. Hart"
Date: Thu, 25 Apr 2019 19:43:33 +0000
Subject: [PATCH] imagenet: add rank indicator to progress summary when distributed

The imagenet example supports distributed training. When run in
distributed mode, multiple ranks perform training and testing, and
every rank produces progress meter output. The output from the various
ranks is interleaved, so it is ambiguous which rank produced any given
line of progress output.

This change adds a rank indicator to the progress prefix when
distributed mode is in effect:

Non-distributed:

    $ python main.py --epochs 1 ...
    => creating model 'resnet18'
    Epoch: [0][ 0/5005] Time 23.173 (23.173) ...
    ...
    Test: [ 0/196] Time 10.899 (10.899) ...
    ...

Distributed (note the additional [0] and [1] in the prefixes):

    $ python main.py --epochs 1 --dist-url 'tcp://127.0.0.1:2200' \
        --dist-backend 'nccl' --multiprocessing-distributed --world-size 1 --rank 0 ...
    Use GPU: 1 for training
    Use GPU: 0 for training
    => creating model 'resnet18'
    => creating model 'resnet18'
    Epoch: [0][0][ 0/5005] Time 20.770 (20.770) ...
    Epoch: [0][1][ 0/5005] Time 20.771 (20.771) ...
    ...
    Test[0]: [ 0/391] Time 7.295 ( 7.295) ...
    Test[1]: [ 0/391] Time 7.188 ( 7.188) ...
    ...
--- imagenet/main.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/imagenet/main.py b/imagenet/main.py index 9cc4937c3f..a69442daba 100644 --- a/imagenet/main.py +++ b/imagenet/main.py @@ -263,7 +263,7 @@ def train(train_loader, model, criterion, optimizer, epoch, args): top1 = AverageMeter('Acc@1', ':6.2f') top5 = AverageMeter('Acc@5', ':6.2f') progress = ProgressMeter(len(train_loader), batch_time, data_time, losses, top1, - top5, prefix="Epoch: [{}]".format(epoch)) + top5, prefix="Epoch: [{}]{}".format(epoch, rank_indicator(args))) # switch to train mode model.train() @@ -306,7 +306,7 @@ def validate(val_loader, model, criterion, args): top1 = AverageMeter('Acc@1', ':6.2f') top5 = AverageMeter('Acc@5', ':6.2f') progress = ProgressMeter(len(val_loader), batch_time, losses, top1, top5, - prefix='Test: ') + prefix='Test{}: '.format(rank_indicator(args))) # switch to evaluate mode model.eval() @@ -348,6 +348,15 @@ def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): shutil.copyfile(filename, 'model_best.pth.tar') +def rank_indicator(args): + if args.distributed: + digits = len(str(args.world_size)) + indicator = '[{rank:{width}}]'.format(rank=args.rank, width=digits) + else: + indicator = '' + return indicator + + class AverageMeter(object): """Computes and stores the average and current value""" def __init__(self, name, fmt=':f'):