99import prometheus_client
1010from datetime import datetime
1111from collections import defaultdict
12+ from socket import gethostbyname
1213
1314utc_format = '%Y-%m-%dT%H:%M:%S'
1415
@@ -19,32 +20,39 @@ def generate_ads(entries):
1920
def last_jobs_dict(collector, access_points=None):
    """Build the per-schedd tracker of the last job state seen.

    Iterates the module-level ``args`` (collector addresses from the
    command line) rather than the ``collector`` argument; the parameter
    is retained unused for backward compatibility with existing callers.

    Args:
        collector (str): unused; kept for call compatibility
        access_points (list | None): optional schedd names forwarded to
            locate_schedds(); None/empty locates all schedds in the pool

    Returns:
        defaultdict | None: mapping of schedd name to
        ``{'ClusterId': None, 'EnteredCurrentStatus': None}``, or None
        when any collector lookup yields no schedds (caller should retry)
    """
    last_job = defaultdict(dict)

    for coll_address in args:  # NOTE: module-level args, not the parameter
        schedd_ads = locate_schedds(coll_address, access_points)
        # locate_schedds returns a (possibly empty) list on failure paths;
        # treat "nothing located" the same as the old None sentinel.
        if not schedd_ads:
            return None

        for s in schedd_ads:
            last_job[s.get('Name')] = {'ClusterId': None, 'EnteredCurrentStatus': None}

    return last_job
32-
3333
def locate_schedds(collector, access_points=None):
    """Locate schedd daemon ads via a condor collector.

    Args:
        collector (str): address of the collector to query
        access_points (list[str] | None): specific schedd names to locate;
            when falsy, every schedd known to the collector is located

    Returns:
        list: schedd location classads (empty if all lookups failed)
    """
    coll = htcondor.Collector(collector)
    schedds = []
    if access_points:
        # try/except per access point so one failed lookup does not
        # abort the remaining APs
        for ap in access_points:
            try:
                schedds.append(coll.locate(htcondor.DaemonTypes.Schedd, ap))
            except htcondor.HTCondorIOError as e:
                logging.error(f'Condor error: {e}')
    else:
        try:
            # BUG FIX: locateAll returns a list of ads; extend (not append)
            # so callers iterate ads rather than a nested list
            schedds.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
        except htcondor.HTCondorIOError as e:
            logging.error(f'Condor error: {e}')
    # BUG FIX: the function previously fell off the end and implicitly
    # returned None, so callers iterating the result raised TypeError
    return schedds
4149
4250def compose_ad_metrics (ad , metrics ):
4351 ''' Parse condor job classad and update metrics
4452
4553 Args:
4654 ad (classad): an HTCondor job classad
47- metrics (JobMetrics): JobMetrics object
55+ metrics (JobMetrics): JobMetrics object
4856 '''
4957 # ignore this ad if walltimehrs is negative or a dagman
5058 if ad ['walltimehrs' ] < 0 or ad ['Cmd' ] == '/usr/bin/condor_dagman' :
@@ -64,7 +72,7 @@ def compose_ad_metrics(ad, metrics):
6472 labels ['site' ] = ad ['site' ]
6573 labels ['schedd' ] = ad ['GlobalJobId' ][0 :ad ['GlobalJobId' ].find ('#' )]
6674 labels ['GPUDeviceName' ] = None
67-
75+
6876 if ad ['ExitCode' ] == 0 and ad ['ExitBySignal' ] is False and ad ['JobStatus' ] == 4 :
6977 labels ['usage' ] = 'goodput'
7078 else :
@@ -83,7 +91,7 @@ def compose_ad_metrics(ad, metrics):
8391 resource_hrs = ad ['cpuhrs' ]
8492 resource_request = ad ['RequestCpus' ]
8593
86- try :
94+ try :
8795 labels ['IceProdDataset' ] = ad ['IceProdDataset' ]
8896 labels ['IceProdTaskName' ] = ad ['IceProdTaskName' ]
8997 except :
@@ -100,39 +108,27 @@ def compose_ad_metrics(ad, metrics):
100108 metrics .condor_job_mem_req .labels (** labels ).observe (ad ['RequestMemory' ]/ 1024 )
101109 metrics .condor_job_mem_used .labels (** labels ).observe (ad ['ResidentSetSize_RAW' ]/ 1048576 )
102110
def query_collector(collector, access_points, metrics, last_job):
    """Query schedds for job ads

    Args:
        collector (str): address for a collector to query
        access_points (list | None): specific schedd names to locate;
            None/empty locates all schedds in the pool
        metrics (JobMetrics): JobMetrics instance
        last_job (dict): dictionary for tracking last ClusterId by schedd
    """
    for schedd_ad in locate_schedds(collector, access_points):
        name = schedd_ad.get('Name')

        # read history since the last ClusterId we recorded for this schedd
        ads = read_from_schedd(schedd_ad, history=True, since=last_job[name]['ClusterId'])
        # BUG FIX: iterate_ads takes four required arguments; it was being
        # called without last_job, raising TypeError on every scrape cycle
        iterate_ads(ads, name, metrics, last_job)
128124
129125def read_from_schedd (schedd_ad , history = False , constraint = 'true' , projection = [],match = 10000 ,since = None ):
130126 """Connect to schedd and pull ads directly.
131127
132128 A generator that yields condor job dicts.
133129
134130 Args:
135- schedd (ClassAd): location_add of a schedd, from either htcondor.Colletor locate() or locateAll()
131+ schedd (ClassAd): location_add of a schedd, from either htcondor.Colletor locate() or locateAll()
136132 history (bool): read history (True) or active queue (default: False)
137133 constraint (string): string representation of a classad expression
138134 match (int): number of job ads to return
@@ -158,6 +154,21 @@ def read_from_schedd(schedd_ad, history=False, constraint='true', projection=[],
158154 except Exception :
159155 logging .info ('%s failed' , schedd_ad ['Name' ], exc_info = True )
160156
def iterate_ads(ads, name, metrics, last_job):
    """Consume job ads from one schedd, tracking the newest job seen.

    Updates ``last_job[name]`` with the most recent ClusterId /
    EnteredCurrentStatus encountered, then folds every ad into metrics.

    Args:
        ads: raw ads iterable, filtered through generate_ads()
        name (str): schedd name used as the last_job key
        metrics (JobMetrics): metrics object updated per ad
        last_job (dict): per-schedd tracker of ClusterId and
            EnteredCurrentStatus high-water marks
    """
    tracker = last_job[name]

    if tracker['EnteredCurrentStatus'] is not None:
        logging.info(f'{name} - read ads since {last_job[name]["ClusterId"]}:{last_job[name]["EnteredCurrentStatus"]} at timestamp {datetime.strptime(last_job[name]["EnteredCurrentStatus"],utc_format)}')

    for ad in generate_ads(ads):
        if tracker['ClusterId'] is None:
            # first ad observed for this schedd seeds the tracker
            tracker['ClusterId'] = int(ad['ClusterId'])
            tracker['EnteredCurrentStatus'] = ad['EnteredCurrentStatus']
        elif datetime.strptime(ad['EnteredCurrentStatus'], utc_format) > datetime.strptime(tracker['EnteredCurrentStatus'], utc_format):
            # newer job status than the recorded high-water mark
            tracker['ClusterId'] = int(ad['ClusterId'])
            tracker['EnteredCurrentStatus'] = ad['EnteredCurrentStatus']

        compose_ad_metrics(ad, metrics)
171+
161172if __name__ == '__main__' :
162173 logging .basicConfig (level = logging .INFO , format = '%(asctime)s %(levelname)s %(name)s : %(message)s' )
163174
@@ -168,6 +179,7 @@ def read_from_schedd(schedd_ad, history=False, constraint='true', projection=[],
168179 # TODO: Add file tail function for condor history files
169180 #parser.add_option('-f','--histfile',
170181 # help='history file to read from')
182+ parser .add_option ('-a' ,'--access_points' ,default = None )
171183 parser .add_option ('-p' ,'--port' , default = 9100 ,
172184 action = 'store' , type = 'int' ,
173185 help = 'port number for prometheus exporter' )
@@ -196,10 +208,10 @@ def read_from_schedd(schedd_ad, history=False, constraint='true', projection=[],
196208 while True :
197209 start = time .time ()
198210 for collector in args :
199- query_collector (collector , metrics , last_job )
211+ query_collector (collector , options . access_points , metrics , last_job )
200212
201213 delta = time .time () - start
202214 # sleep for interval minus scrape duration
203215 # if scrape duration was longer than interval, run right away
204216 if delta < options .interval :
205- time .sleep (options .interval - delta )
217+ time .sleep (options .interval - delta )