@@ -141,6 +141,9 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
141141
142142 name = self.config.name
143143 namespace = self.config.namespace
144+ head_cpus = self.config.head_cpus
145+ head_memory = self.config.head_memory
146+ head_gpus = self.config.head_gpus
144147 min_cpu = self.config.min_cpus
145148 max_cpu = self.config.max_cpus
146149 min_memory = self.config.min_memory
@@ -158,6 +161,9 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
158161 return generate_appwrapper(
159162 name=name,
160163 namespace=namespace,
164+ head_cpus=head_cpus,
165+ head_memory=head_memory,
166+ head_gpus=head_gpus,
161167 min_cpu=min_cpu,
162168 max_cpu=max_cpu,
163169 min_memory=min_memory,
@@ -290,7 +296,7 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
290296 else:
291297 return False
292298
293- def wait_ready(self, timeout: Optional[int] = None):
299+ def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True ):
294300 """
295301 Waits for requested cluster to be ready, up to an optional timeout (s).
296302 Checks every five seconds.
@@ -300,19 +306,32 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
300306 dashboard_ready = False
301307 status = None
302308 time = 0
303- while not ready or not dashboard_ready :
309+ while not ready:
304310 status, ready = self.status(print_to_console=False)
305- dashboard_ready = self.is_dashboard_ready()
306311 if status == CodeFlareClusterStatus.UNKNOWN:
307312 print(
308313 "WARNING: Current cluster status is unknown, have you run cluster.up yet?"
309314 )
310- if not ready or not dashboard_ready:
315+ if not ready:
316+ if timeout and time >= timeout:
317+ raise TimeoutError(
318+ f"wait() timed out after waiting {timeout}s for cluster to be ready"
319+ )
320+ sleep(5)
321+ time += 5
322+ print("Requested cluster is up and running!")
323+
324+ while dashboard_check and not dashboard_ready:
325+ dashboard_ready = self.is_dashboard_ready()
326+ if not dashboard_ready:
311327 if timeout and time >= timeout:
312- raise TimeoutError(f"wait() timed out after waiting {timeout}s")
328+ raise TimeoutError(
329+ f"wait() timed out after waiting {timeout}s for dashboard to be ready"
330+ )
313331 sleep(5)
314332 time += 5
315- print("Requested cluster and dashboard are up and running!")
333+ if dashboard_ready:
334+ print("Dashboard is ready!")
316335
317336 def details(self, print_to_console: bool = True) -> RayCluster:
318337 cluster = _copy_to_ray(self)
@@ -640,6 +659,15 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
640659 worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for
641660 namespace=rc["metadata"]["namespace"],
642661 dashboard=ray_route,
662+ head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
663+ "resources"
664+ ]["limits"]["cpu"],
665+ head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
666+ "resources"
667+ ]["limits"]["memory"],
668+ head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
669+ "resources"
670+ ]["limits"]["nvidia.com/gpu"],
643671 )
644672
645673
@@ -670,6 +698,9 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
670698 worker_gpu=cluster.config.num_gpus,
671699 namespace=cluster.config.namespace,
672700 dashboard=cluster.cluster_dashboard_uri(),
701+ head_cpus=cluster.config.head_cpus,
702+ head_mem=cluster.config.head_memory,
703+ head_gpu=cluster.config.head_gpus,
673704 )
674705 if ray.status == CodeFlareClusterStatus.READY:
675706 ray.status = RayClusterStatus.READY
@@ -879,6 +910,9 @@ <h2 class="section-title" id="header-classes">Classes</h2>
879910
880911 name = self.config.name
881912 namespace = self.config.namespace
913+ head_cpus = self.config.head_cpus
914+ head_memory = self.config.head_memory
915+ head_gpus = self.config.head_gpus
882916 min_cpu = self.config.min_cpus
883917 max_cpu = self.config.max_cpus
884918 min_memory = self.config.min_memory
@@ -896,6 +930,9 @@ <h2 class="section-title" id="header-classes">Classes</h2>
896930 return generate_appwrapper(
897931 name=name,
898932 namespace=namespace,
933+ head_cpus=head_cpus,
934+ head_memory=head_memory,
935+ head_gpus=head_gpus,
899936 min_cpu=min_cpu,
900937 max_cpu=max_cpu,
901938 min_memory=min_memory,
@@ -1028,7 +1065,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
10281065 else:
10291066 return False
10301067
1031- def wait_ready(self, timeout: Optional[int] = None):
1068+ def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True ):
10321069 """
10331070 Waits for requested cluster to be ready, up to an optional timeout (s).
10341071 Checks every five seconds.
@@ -1038,19 +1075,32 @@ <h2 class="section-title" id="header-classes">Classes</h2>
10381075 dashboard_ready = False
10391076 status = None
10401077 time = 0
1041- while not ready or not dashboard_ready :
1078+ while not ready:
10421079 status, ready = self.status(print_to_console=False)
1043- dashboard_ready = self.is_dashboard_ready()
10441080 if status == CodeFlareClusterStatus.UNKNOWN:
10451081 print(
10461082 "WARNING: Current cluster status is unknown, have you run cluster.up yet?"
10471083 )
1048- if not ready or not dashboard_ready:
1084+ if not ready:
1085+ if timeout and time >= timeout:
1086+ raise TimeoutError(
1087+ f"wait() timed out after waiting {timeout}s for cluster to be ready"
1088+ )
1089+ sleep(5)
1090+ time += 5
1091+ print("Requested cluster is up and running!")
1092+
1093+ while dashboard_check and not dashboard_ready:
1094+ dashboard_ready = self.is_dashboard_ready()
1095+ if not dashboard_ready:
10491096 if timeout and time >= timeout:
1050- raise TimeoutError(f"wait() timed out after waiting {timeout}s")
1097+ raise TimeoutError(
1098+ f"wait() timed out after waiting {timeout}s for dashboard to be ready"
1099+ )
10511100 sleep(5)
10521101 time += 5
1053- print("Requested cluster and dashboard are up and running!")
1102+ if dashboard_ready:
1103+ print("Dashboard is ready!")
10541104
10551105 def details(self, print_to_console: bool = True) -> RayCluster:
10561106 cluster = _copy_to_ray(self)
@@ -1267,6 +1317,9 @@ <h3>Methods</h3>
12671317
12681318 name = self.config.name
12691319 namespace = self.config.namespace
1320+ head_cpus = self.config.head_cpus
1321+ head_memory = self.config.head_memory
1322+ head_gpus = self.config.head_gpus
12701323 min_cpu = self.config.min_cpus
12711324 max_cpu = self.config.max_cpus
12721325 min_memory = self.config.min_memory
@@ -1284,6 +1337,9 @@ <h3>Methods</h3>
12841337 return generate_appwrapper(
12851338 name=name,
12861339 namespace=namespace,
1340+ head_cpus=head_cpus,
1341+ head_memory=head_memory,
1342+ head_gpus=head_gpus,
12871343 min_cpu=min_cpu,
12881344 max_cpu=max_cpu,
12891345 min_memory=min_memory,
@@ -1653,7 +1709,7 @@ <h3>Methods</h3>
16531709</details>
16541710</dd>
16551711<dt id="codeflare_sdk.cluster.cluster.Cluster.wait_ready"><code class="name flex">
1656- <span>def <span class="ident">wait_ready</span></span>(<span>self, timeout: Optional[int] = None)</span>
1712+ <span>def <span class="ident">wait_ready</span></span>(<span>self, timeout: Optional[int] = None, dashboard_check: bool = True )</span>
16571713</code></dt>
16581714<dd>
16591715<div class="desc"><p>Waits for requested cluster to be ready, up to an optional timeout (s).
@@ -1662,7 +1718,7 @@ <h3>Methods</h3>
16621718<summary>
16631719<span>Expand source code</span>
16641720</summary>
1665- <pre><code class="python">def wait_ready(self, timeout: Optional[int] = None):
1721+ <pre><code class="python">def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True ):
16661722 """
16671723 Waits for requested cluster to be ready, up to an optional timeout (s).
16681724 Checks every five seconds.
@@ -1672,19 +1728,32 @@ <h3>Methods</h3>
16721728 dashboard_ready = False
16731729 status = None
16741730 time = 0
1675- while not ready or not dashboard_ready :
1731+ while not ready:
16761732 status, ready = self.status(print_to_console=False)
1677- dashboard_ready = self.is_dashboard_ready()
16781733 if status == CodeFlareClusterStatus.UNKNOWN:
16791734 print(
16801735 "WARNING: Current cluster status is unknown, have you run cluster.up yet?"
16811736 )
1682- if not ready or not dashboard_ready:
1737+ if not ready:
1738+ if timeout and time >= timeout:
1739+ raise TimeoutError(
1740+ f"wait() timed out after waiting {timeout}s for cluster to be ready"
1741+ )
1742+ sleep(5)
1743+ time += 5
1744+ print("Requested cluster is up and running!")
1745+
1746+ while dashboard_check and not dashboard_ready:
1747+ dashboard_ready = self.is_dashboard_ready()
1748+ if not dashboard_ready:
16831749 if timeout and time >= timeout:
1684- raise TimeoutError(f"wait() timed out after waiting {timeout}s")
1750+ raise TimeoutError(
1751+ f"wait() timed out after waiting {timeout}s for dashboard to be ready"
1752+ )
16851753 sleep(5)
16861754 time += 5
1687- print("Requested cluster and dashboard are up and running!")</code></pre>
1755+ if dashboard_ready:
1756+ print("Dashboard is ready!")</code></pre>
16881757</details>
16891758</dd>
16901759</dl>
0 commit comments