@@ -97,15 +97,15 @@ def _config_gpu_memory(self, gpu_mem_cap):
                         tf.config.set_memory_growth(gpu, True)
                     except AttributeError:
                         tf.config.experimental.set_memory_growth(gpu, True)
-
+
                 else:
                     try:
                         set_virtual_device_configuration = tf.config.set_virtual_device_configuration
                         device_config = tf.config.LogicalDeviceConfiguration(memory_limit=gpu_mem_cap)
                     except AttributeError:
                         set_virtual_device_configuration = tf.config.experimental.set_virtual_device_configuration
                         device_config = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=gpu_mem_cap)
-
+
                     set_virtual_device_configuration(gpu, [device_config])
             except RuntimeError as e:
                 print('Can not set GPU memory config', e)
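The try/except AttributeError fallbacks above exist because newer TensorFlow releases promote these helpers from tf.config.experimental to tf.config. A minimal sketch of the same probe-and-fallback idiom, resolving the API location once before use (assuming a TF 2.x runtime):

import tensorflow as tf

# Probe for the stable API first; fall back to the experimental
# namespace that older TF 2.x releases use.
try:
    set_memory_growth = tf.config.set_memory_growth
except AttributeError:
    set_memory_growth = tf.config.experimental.set_memory_growth

for gpu in tf.config.list_physical_devices('GPU'):
    set_memory_growth(gpu, True)  # allocate VRAM on demand instead of all upfront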
@@ -378,18 +378,40 @@ def infer_batch(x):
         )

         iter_times = []
+        memcopy_times = []

-        def log_step(step_idx, display_every, iter_time):
+        def log_step(step_idx, display_every, iter_time, memcpyHtoD_time):
             if step_idx % display_every == 0:
                 print(
-                    f"step {step_idx:04d}, iter_time(ms)={iter_time:.3f}"
+                    f"step {step_idx:04d}, iter_time(ms)={iter_time:.3f}, memcpyHtoD_time(ms)={memcpyHtoD_time:.3f}"
                 )

         dataset = timed_dataset(
             dataset, activate=self._args.debug_performance
         )

+        @force_gpu_resync
+        @tf.function()
+        def force_data_on_gpu(data, device="/gpu:0"):
+            with tf.device(device):
+                if isinstance(data, (list, tuple)):
+                    output_data = list()
+                    for t in data:
+                        output_data.append(tf.identity(t))
+                elif isinstance(data, dict):
+                    output_data = dict()
+                    for k, v in data.items():
+                        output_data[k] = tf.identity(v)
+                else:
+                    output_data = tf.identity(data)
+            return output_data
+
         for step_idx, data_batch in enumerate(dataset):
+
+            start_time = time.time()
+            data_batch = force_data_on_gpu(data_batch)
+            memcopy_times.append(time.time() - start_time)
+
             x, y = self.preprocess_model_inputs(data_batch)

             start_time = time.time()
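Two things happen in the new force_data_on_gpu helper: tf.identity inside a tf.device("/gpu:0") scope forces the host-to-device copy that the loop then times, and the force_gpu_resync decorator (defined elsewhere in this file, not shown in this diff) makes the timing honest by blocking until the GPU finishes. A hypothetical sketch of such a decorator, not the repository's exact definition:

import tensorflow as tf

def force_gpu_resync(func):
    # Hypothetical sketch: fetching any tensor back to the host with
    # .numpy() is a synchronization point in eager mode, so the wall-clock
    # time measured around the call covers the queued GPU work, not just
    # the asynchronous kernel/copy launch.
    sync_tensor = tf.constant(0.0)

    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        (sync_tensor + 1.0).numpy()  # blocks until the device queue drains
        return result

    return wrapper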
@@ -398,11 +420,14 @@ def log_step(step_idx, display_every, iter_time):
             )

             if not self._args.debug_performance:
                 log_step(
-                    step_idx + 1, self._args.display_every,
-                    np.mean(iter_times[-self._args.display_every:]) * 1000
+                    step_idx + 1,
+                    display_every=self._args.display_every,
+                    iter_time=np.mean(iter_times[-self._args.display_every:]) * 1000,
+                    memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000
                 )
             else:
                 print(f"{'GPU Iteration Time':18s}: {iter_times[-1]:.4f}s")
+                print(f"{'MemCopyHtoD Iteration Time':18s}: {memcopy_times[-1]:.4f}s")

             if not self._args.use_synthetic_data:
                 data_aggregator.aggregate_data(y_pred, y)
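With both timings wired through, a non-debug run now prints one line per display window in the format defined by log_step above, e.g. (values illustrative):

step 0050, iter_time(ms)=12.345, memcpyHtoD_time(ms)=1.234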
@@ -411,15 +436,15 @@ def log_step(step_idx, display_every, iter_time):
                     step_idx + 1 >= self._args.num_iterations):
                 break

-            if (not self._args.debug_performance and
-                    step_idx % self._args.display_every !=
-                    0):  # avoids double printing
+            if (
+                not self._args.debug_performance and
+                step_idx % self._args.display_every != 0
+            ):  # avoids double printing
                 log_step(
                     step_idx + 1,
                     display_every=1,  # force print
-                    iter_time=(
-                        np.mean(iter_times[-self._args.display_every:]) * 1000
-                    )
+                    iter_time=np.mean(iter_times[-self._args.display_every:]) * 1000,
+                    memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000
                 )

         with timed_section("Metric Computation"):
@@ -440,26 +465,35 @@ def log_step(step_idx, display_every, iter_time):
             # Skipping last batch. Might have different batch_size
             run_times = np.array(iter_times)
             run_times = run_times[self._args.num_warmup_iterations:-1]
+            mem_times = np.array(memcopy_times)
+            mem_times = mem_times[self._args.num_warmup_iterations:-1]

             metrics['Total GPU Time (s)'] = int(np.ceil(np.sum(iter_times)))
             metrics['Throughput (samples/sec)'] = np.mean(
                 self._args.batch_size / run_times
             )
-            metrics['99th_percentile (ms)'] = np.percentile(
-                run_times, q=99, interpolation='lower'
-            ) * 1000
-            metrics['GPU Latency Mean (ms)'] = np.mean(run_times) * 1000
-            metrics['GPU Latency Median (ms)'] = np.median(run_times) * 1000
-            metrics['GPU Latency Min (ms)'] = np.min(run_times) * 1000
-            metrics['GPU Latency Max (ms)'] = np.max(run_times) * 1000
+
+            def timing_metrics(time_arr, log_prefix):
+                data = dict()
+                data[f"{log_prefix} 99th_percentile (ms)"] = np.percentile(
+                    time_arr, q=99, interpolation='lower'
+                ) * 1000
+                data[f"{log_prefix} Mean (ms)"] = np.mean(time_arr) * 1000
+                data[f"{log_prefix} Median (ms)"] = np.median(time_arr) * 1000
+                data[f"{log_prefix} Min (ms)"] = np.min(time_arr) * 1000
+                data[f"{log_prefix} Max (ms)"] = np.max(time_arr) * 1000
+                return data
+
+            metrics.update(timing_metrics(run_times, "GPU Latency"))
+            metrics.update(timing_metrics(mem_times, "MemCopyHtoD Time"))

             self._export_runtime_metrics_to_json(metrics)

             def log_value(key, val):
                 if isinstance(val, int):
-                    print(f"- {key:35s}: {val}")
+                    print(f"- {key:40s}: {val}")
                 else:
-                    print(f"- {key:35s}: {val:.2f}")
+                    print(f"- {key:40s}: {val:.2f}")

             for key, val in sorted(metrics.items()):
                 if isinstance(val, dict):
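The timing_metrics helper factors out the five summary statistics so the same reduction runs over both the GPU latency array and the new HtoD copy array. A standalone usage sketch with made-up latencies (the helper is lifted to module scope here purely for illustration; note that NumPy 1.22+ renames np.percentile's interpolation= keyword to method=):

import numpy as np

def timing_metrics(time_arr, log_prefix):
    data = dict()
    data[f"{log_prefix} 99th_percentile (ms)"] = np.percentile(
        time_arr, q=99, interpolation='lower'  # method='lower' on NumPy >= 1.22
    ) * 1000
    data[f"{log_prefix} Mean (ms)"] = np.mean(time_arr) * 1000
    data[f"{log_prefix} Median (ms)"] = np.median(time_arr) * 1000
    data[f"{log_prefix} Min (ms)"] = np.min(time_arr) * 1000
    data[f"{log_prefix} Max (ms)"] = np.max(time_arr) * 1000
    return data

run_times = np.array([0.0101, 0.0098, 0.0103, 0.0250, 0.0099])  # seconds, made up
for key, val in sorted(timing_metrics(run_times, "GPU Latency").items()):
    print(f"- {key:40s}: {val:.2f}")  # same layout as log_value above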