Skip to content

Commit 0e83a7c

Browse files
authored
fix(bug): Prevent node duplication during leftover node allocation (#209)
1 parent b76a93a commit 0e83a7c

File tree

2 files changed

+40
-0
lines changed

2 files changed

+40
-0
lines changed

src/scheduling/layer_allocation.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@ def __init__(
112112

113113
self.layer_to_load: Dict[int, LayerLoad] = {}
114114
self.node_id_to_node: Dict[str, Node] = {}
115+
# Sync dict with initial nodes; prevents declare() from adding duplicates
116+
# when allocate_left_over_nodes() processes unallocated nodes
117+
for node in self.nodes:
118+
self.node_id_to_node[node.node_id] = node
115119

116120
# Pipeline endpoints for routing
117121
self.embedding_node_ids: List[str] = []

tests/scheduler_tests/test_layer_allocation.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ def _test_gap_patch_rebalance(allocator: BaseLayerAllocator):
100100
per_node_mem = {
101101
nid: (node.per_decoder_layer_kv_cache_memory or 0)
102102
for nid, node in allocator.node_id_to_node.items()
103+
if nid in allocator.node_allocation
103104
}
104105
min_mem = min(per_node_mem.values()) if per_node_mem else 0
105106

@@ -279,3 +280,38 @@ def test_pipeline_required_with_midrange_only(strategy: Literal["greedy", "dp"])
279280
assert len(assigned) >= 2
280281
total = sum(e - s for _, s, e in assigned)
281282
assert total == model.num_layers
283+
284+
285+
@pytest.mark.parametrize("strategy", ["greedy", "dp"])
286+
def test_allocator_does_not_duplicate_leftover_nodes(strategy: Literal["greedy", "dp"]):
287+
"""Both allocators should not duplicate self.nodes when left over after allocation.
288+
289+
Greedy: builds multiple pipelines, leaves nodes that can't form another pipeline
290+
DP: optimizes for best pipeline configuration, may leave suboptimal nodes unallocated
291+
"""
292+
model = build_model_info(12)
293+
294+
if strategy == "greedy":
295+
# Two strong nodes form two complete pipelines (greedy maximizes pipelines)
296+
# One weak node left unallocated
297+
a100_1 = _build_node("a100-80g", model, id_suffix="-a1")
298+
a100_2 = _build_node("a100-80g", model, id_suffix="-a2")
299+
r1 = _build_node("rtx4090", model, id_suffix="-1")
300+
nodes = [a100_1, a100_2, r1]
301+
expected_node_count = 3
302+
else:
303+
# One strong node handles all layers (DP finds this optimal)
304+
# One weak node left unallocated
305+
a100 = _build_node("a100-80g", model, id_suffix="-a")
306+
r1 = _build_node("rtx4090", model, id_suffix="-1")
307+
nodes = [a100, r1]
308+
expected_node_count = 2
309+
310+
alloc = (
311+
GreedyLayerAllocator(model, nodes)
312+
if strategy == "greedy"
313+
else DynamicProgrammingLayerAllocator(model, nodes)
314+
)
315+
ok = alloc.global_allocation()
316+
assert ok is True
317+
assert len(alloc.nodes) == expected_node_count, "Should not duplicate nodes during allocation"

0 commit comments

Comments
 (0)