@@ -374,11 +374,15 @@ def infer_batch(x):
374374
375375 iter_times = []
376376 memcopy_times = []
377+ dequeue_times = []
377378
378- def log_step (step_idx , display_every , iter_time , memcpyHtoD_time ):
379+ def log_step (step_idx , display_every , iter_time , memcpyHtoD_time , dequeue_time ):
379380 if step_idx % display_every == 0 :
380381 print (
381- f" step { step_idx :04d} , iter_time(ms)={ iter_time :.3f} , memcpyHtoD_time(ms)={ memcpyHtoD_time :.3f} "
382+ f"step { step_idx :04d} , "
383+ f"iter_time(ms)={ iter_time :08.3f} , "
384+ f"memcpyHtoD_time(ms)={ memcpyHtoD_time :08.3f} , "
385+ f"dequeue_time(ms)={ dequeue_time :08.3f} "
382386 )
383387
384388 dataset = timed_dataset (
@@ -401,7 +405,17 @@ def force_data_on_gpu(data, device="/gpu:0"):
401405 output_data = tf .identity (data )
402406 return output_data
403407
404- for step_idx , data_batch in enumerate (dataset ):
408+ step_idx = 0
409+ ds_iter = iter (dataset )
410+
411+ while True :
412+
413+ try :
414+ start_time = time .time ()
415+ data_batch = next (ds_iter )
416+ dequeue_times .append (time .time () - start_time )
417+ except (StopIteration , tf .errors .OutOfRangeError ):
418+ break
405419
406420 start_time = time .time ()
407421 data_batch = force_data_on_gpu (data_batch )
@@ -418,11 +432,13 @@ def force_data_on_gpu(data, device="/gpu:0"):
418432 step_idx + 1 ,
419433 display_every = self ._args .display_every ,
420434 iter_time = np .mean (iter_times [- self ._args .display_every :]) * 1000 ,
421- memcpyHtoD_time = np .mean (memcopy_times [- self ._args .display_every :]) * 1000
435+ memcpyHtoD_time = np .mean (memcopy_times [- self ._args .display_every :]) * 1000 ,
436+ dequeue_time = np .mean (dequeue_times [- self ._args .display_every :]) * 1000
422437 )
423438 else :
424- print (f"{ 'GPU Iteration Time' :18s} : { iter_times [- 1 ]:.4f} s" )
425- print (f"{ 'MemCopyHtoD Iteration Time' :18s} : { iter_times [- 1 ]:.4f} s" )
439+ print (f"{ 'GPU Iteration Time' :18s} : { iter_times [- 1 ]:08.4f} s" )
440+ print (f"{ 'Data MemCopyHtoD Time' :18s} : { memcopy_times [- 1 ]:08.4f} s" )
441+ print (f"{ 'Data Dequeue Time' :18s} : { dequeue_times [- 1 ]:08.4f} s" )
426442
427443 if not self ._args .use_synthetic_data :
428444 data_aggregator .aggregate_data (y_pred , y )
@@ -431,6 +447,8 @@ def force_data_on_gpu(data, device="/gpu:0"):
431447 step_idx + 1 >= self ._args .num_iterations ):
432448 break
433449
450+ step_idx += 1
451+
434452 if (
435453 not self ._args .debug_performance and
436454 step_idx % self ._args .display_every != 0
@@ -439,7 +457,8 @@ def force_data_on_gpu(data, device="/gpu:0"):
439457 step_idx + 1 ,
440458 display_every = 1 , # force print
441459 iter_time = np .mean (iter_times [- self ._args .display_every :]) * 1000 ,
442- memcpyHtoD_time = np .mean (memcopy_times [- self ._args .display_every :]) * 1000
460+ memcpyHtoD_time = np .mean (memcopy_times [- self ._args .display_every :]) * 1000 ,
461+ dequeue_time = np .mean (dequeue_times [- self ._args .display_every :]) * 1000
443462 )
444463
445464 with timed_section ("Metric Computation" ):
@@ -458,14 +477,18 @@ def force_data_on_gpu(data, device="/gpu:0"):
458477 )
459478
460479 # Skipping last batch. Might have different batch_size
461- run_times = np .array (iter_times )
462- run_times = run_times [self ._args .num_warmup_iterations :- 1 ]
463- mem_times = np .array (memcopy_times )
464- mem_times = mem_times [self ._args .num_warmup_iterations :- 1 ]
480+ iter_times = np .array (iter_times )
481+ iter_times = iter_times [self ._args .num_warmup_iterations :- 1 ]
482+
483+ memcopy_times = np .array (memcopy_times )
484+ memcopy_times = memcopy_times [self ._args .num_warmup_iterations :- 1 ]
485+
486+ dequeue_times = np .array (dequeue_times )
487+ dequeue_times = dequeue_times [self ._args .num_warmup_iterations :- 1 ]
465488
466489 metrics ['Total GPU Time (s)' ] = int (np .ceil (np .sum (iter_times )))
467- metrics ['Throughput (samples/sec)' ] = np . mean (
468- self ._args .batch_size / run_times
490+ metrics ['Throughput (samples/sec)' ] = (
491+ self ._args .batch_size / np . mean ( iter_times )
469492 )
470493
471494 def timing_metrics (time_arr , log_prefix ):
@@ -479,16 +502,17 @@ def timing_metrics(time_arr, log_prefix):
479502 data [f"{ log_prefix } Max (ms)" ] = np .max (time_arr ) * 1000
480503 return data
481504
482- metrics .update (timing_metrics (run_times , "GPU Latency" ))
483- metrics .update (timing_metrics (mem_times , "MemCopyHtoD Time" ))
505+ metrics .update (timing_metrics (iter_times , "GPU Latency" ))
506+ metrics .update (timing_metrics (dequeue_times , "Data Batch Dequeue Time" ))
507+ metrics .update (timing_metrics (memcopy_times , "Data MemCopyHtoD Time" ))
484508
485509 self ._export_runtime_metrics_to_json (metrics )
486510
487511 def log_value (key , val ):
488512 if isinstance (val , int ):
489- print (f"- { key :40s } : { val } " )
513+ print (f"- { key :45s } : { val } " )
490514 else :
491- print (f"- { key :40s } : { val :.2f} " )
515+ print (f"- { key :45s } : { val :.2f} " )
492516
493517 for key , val in sorted (metrics .items ()):
494518 if isinstance (val , dict ):
0 commit comments