From 3e13b20e58e4106efb8961a5da7fa54b59ddd389 Mon Sep 17 00:00:00 2001 From: rahul-tuli Date: Thu, 25 Sep 2025 11:42:46 +0000 Subject: [PATCH 1/3] feat: Enable Eagle3 multimodal support for Llama4 verifier models - Add SupportsEagle3 interface to Llama4ForConditionalGeneration and Llama4ForCausalLM - Implement custom auxiliary hidden state layers (1, 23, 44) for Eagle3 speculative decoding - Enable multimodal input handling in Eagle3LlamaForCausalLM with text-only inference mode - Add proper dimension adaptation for auxiliary hidden states from multimodal verifiers - Implement dynamic Eagle3 auxiliary layer configuration from speculators config - Add GPU model runner method to read eagle_aux_hidden_state_layer_ids from draft config - Update auxiliary layer configuration logic to use speculative config dynamically - Simplify model implementations to provide fallback defaults This is the first successful implementation of Eagle3 speculative decoding with multimodal Llama4 models, supporting custom layer extraction and text-only drafter processing while leveraging multimodal context from auxiliary hidden states. The implementation now dynamically reads auxiliary layer configuration from the draft model's speculative config, eliminating hardcoded layer IDs. --- vllm/model_executor/models/llama.py | 5 ++ vllm/model_executor/models/llama4.py | 1 + vllm/model_executor/models/llama_eagle3.py | 69 ++++++++++++++++--- vllm/model_executor/models/mllama4.py | 21 +++++- .../configs/speculators/algos.py | 6 ++ vllm/v1/worker/gpu_model_runner.py | 29 +++++++- 6 files changed, 119 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1b03cbef501b..a66c72cef207 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -572,6 +572,11 @@ def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: self.model.aux_hidden_state_layers = layers def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + """Override to return default layers for Llama + + Note: The GPU model runner will override this with layers from + the speculative config if available, providing dynamic configuration. 
+        """
         num_layers = len(self.model.layers)
         return (2, num_layers // 2, num_layers - 3)
 
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index ddd7e6a5936e..c87bbc075d9b 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -671,6 +671,7 @@ def _init_model(self,
                           prefix=prefix,
                           layer_type=layer_type)
 
+
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index b99a1547918e..8cae9ed41e78 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -21,8 +21,9 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import (LlamaDecoderLayer,
                                               LlamaForCausalLM)
+from vllm.multimodal.inputs import NestedTensors
 
-from .utils import AutoWeightsLoader, maybe_prefix
+from .utils import AutoWeightsLoader, maybe_prefix, merge_multimodal_embeddings
 
 logger = init_logger(__name__)
 
@@ -147,12 +148,20 @@ def __init__(
 
     def forward(
         self,
-        input_ids: torch.Tensor,
+        input_ids: Optional[torch.Tensor],
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
+        inputs_embeds: Optional[torch.Tensor] = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        input_embeds = self.embed_tokens(input_ids)
-        assert hidden_states.shape[-1] == input_embeds.shape[-1]
+        if inputs_embeds is not None:
+            input_embeds = inputs_embeds
+        else:
+            input_embeds = self.embed_tokens(input_ids)
+
+        # Check dimension compatibility only once the input embeddings are
+        # known; auxiliary states from multimodal verifiers may be wider.
+        if hidden_states.shape[-1] != input_embeds.shape[-1]:
+            hidden_states = self.fc(hidden_states)
 
         residual = None
         hidden_states, residual = self.layers[0](
@@ -200,6 +209,7 @@ def load_weights(self, weights: Iterable[tuple[str,
 class Eagle3LlamaForCausalLM(LlamaForCausalLM):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        logger.debug("Initializing Eagle3LlamaForCausalLM")
         nn.Module.__init__(self)
         self.config = vllm_config. \
             speculative_config.draft_model_config.hf_config
@@ -232,6 +242,35 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             requires_grad=False,
         )
 
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[NestedTensors] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.model.embed_tokens(input_ids)
+
+        # Check if this drafter is configured for text-only inference
+        inference_type = getattr(self.config, 'inference_type', 'multimodal')
+
+        if multimodal_embeddings is not None and inference_type != 'text':
+            # Multimodal drafter: splice the multimodal embeddings into the
+            # text embeddings at the image-token positions. The verifier's
+            # auxiliary hidden states already carry the multimodal context.
+            # Note: merge_multimodal_embeddings requires image_token_index
+            if hasattr(self.config, 'image_token_index'):
+                inputs_embeds = merge_multimodal_embeddings(
+                    input_ids,
+                    inputs_embeds,
+                    multimodal_embeddings,
+                    self.config.image_token_index,
+                )
+        elif multimodal_embeddings is not None and inference_type == 'text':
+            # Text-only drafter: ignore multimodal embeddings; the verifier
+            # handles all multimodal processing upstream.
+            pass
+
+        return inputs_embeds
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -239,11 +278,25 @@ def forward(
         hidden_states: torch.Tensor,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        # The Eagle3 drafter consumes auxiliary hidden states produced by
+        # the verifier. For multimodal inputs the verifier has already
+        # processed the images, so a text-only drafter can reuse that
+        # context without handling multimodal inputs itself.
+
         if inputs_embeds is not None:
-            raise NotImplementedError(
-                f"{type(self).__name__} does not support multimodal inputs yet."
-            )
-        return self.model(input_ids, positions, hidden_states)
+            # Edge cases such as warmup runs provide precomputed embeddings
+            input_embeds = inputs_embeds
+        else:
+            # Standard case: embed the current tokens for drafting
+            input_embeds = self.model.embed_tokens(input_ids)
+
+        # Adapt auxiliary hidden states whose width differs from the text
+        # embeddings (Eagle3 concatenates several verifier layers)
+        if hidden_states.shape[-1] != input_embeds.shape[-1]:
+            hidden_states = self.model.fc(hidden_states)
+
+        # Combine the text embeddings with the verifier's hidden states
+        return self.model(None, positions, hidden_states, input_embeds)
 
     def compute_logits(
         self,
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 79e315f79489..b46eca9c2890 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -54,7 +54,8 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (MultiModalEmbeddings, SupportsEagle3,
+                         SupportsMultiModal, SupportsPP)
 from .llama4 import Llama4ForCausalLM
 from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix,
                     merge_multimodal_embeddings)
@@ -710,7 +711,7 @@ def get_dummy_mm_data(
     dummy_inputs=Mllama4DummyInputsBuilder,
 )
 class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                     SupportsPP):
+                                     SupportsPP, SupportsEagle3):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -759,6 +760,22 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        """Set which layers emit auxiliary hidden states for EAGLE3."""
+        # Delegate to the underlying language model (Llama4ForCausalLM)
+        assert hasattr(self.language_model, 'set_aux_hidden_state_layers')
+        self.language_model.set_aux_hidden_state_layers(layers)
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        """Get the layer indices for auxiliary hidden state outputs.
+
+        Note: The GPU model runner will override this with layers from
+        the speculative config if available, providing dynamic configuration.
+        """
+        assert hasattr(self.language_model,
+                       'get_eagle3_aux_hidden_state_layers')
+        return self.language_model.get_eagle3_aux_hidden_state_layers()
+
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[Llama4ImagePatchInputs]:
         # num_images, 1, num_chunks, channel, image_size, image_size
diff --git a/vllm/transformers_utils/configs/speculators/algos.py b/vllm/transformers_utils/configs/speculators/algos.py
index efc87b6bcf26..f727587fe2f6 100644
--- a/vllm/transformers_utils/configs/speculators/algos.py
+++ b/vllm/transformers_utils/configs/speculators/algos.py
@@ -30,3 +30,9 @@ def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
     vllm_config["norm_before_residual"] = config_dict.get(
         "norm_before_residual", True)
     vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"]
+    if config_dict.get("eagle_aux_hidden_state_layer_ids"):
+        vllm_config["eagle_aux_hidden_state_layer_ids"] = config_dict[
+            "eagle_aux_hidden_state_layer_ids"]
+    if config_dict.get("inference_type"):
+        vllm_config["inference_type"] = config_dict["inference_type"]
+
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 17f8be86af2f..3a6651a72963 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2587,8 +2587,15 @@ def load_model(self, eep_scale_up: bool = False) -> None:
                 self.drafter.load_model(self.model)
             if self.use_aux_hidden_state_outputs:
                 if supports_eagle3(self.model):
-                    self.model.set_aux_hidden_state_layers(
-                        self.model.get_eagle3_aux_hidden_state_layers())
+                    # Prefer auxiliary layers from the speculative config
+                    aux_layers = self._get_eagle3_aux_layers_from_config()
+                    if aux_layers is not None:
+                        logger.info("Using aux hidden state layers: %s",
+                                    aux_layers)
+                        self.model.set_aux_hidden_state_layers(aux_layers)
+                    else:
+                        self.model.set_aux_hidden_state_layers(
+                            self.model.get_eagle3_aux_hidden_state_layers())
                 else:
                     raise RuntimeError(
                         "Model does not support EAGLE3 interface but "
@@ -2638,6 +2645,24 @@ def load_model(self, eep_scale_up: bool = False) -> None:
         else:
             self.model = UBatchWrapper(self.model, self.vllm_config,
                                        CUDAGraphMode.NONE, self.device)
+
+    def _get_eagle3_aux_layers_from_config(self) -> Optional[tuple[int, ...]]:
+        """
+        Extract Eagle3 auxiliary layer IDs from the speculative config.
+
+        Returns:
+            Tuple of layer indices from the draft model config, or None
+            if not found.
+        """
+        if not (self.speculative_config
+                and self.speculative_config.draft_model_config):
+            return None
+        layer_ids = getattr(
+            self.speculative_config.draft_model_config.hf_config,
+            'eagle_aux_hidden_state_layer_ids', None)
+        if layer_ids and isinstance(layer_ids, (list, tuple)):
+            return tuple(layer_ids)
+        return None
 
     def reload_weights(self) -> None:
         assert getattr(self, "model", None) is not None, \

From 780c07292ffd9a58d9202379b7e381d26ebc5e9d Mon Sep 17 00:00:00 2001
From: rahul-tuli
Date: Fri, 26 Sep 2025 20:47:26 +0000
Subject: [PATCH 2/3] fix: Wire Eagle3 into Qwen2.5-VL and image token handling

- Implement SupportsEagle3 on Qwen2_5_VLForConditionalGeneration,
  delegating auxiliary layer selection to the language model
- Handle verifier configs that expose image_token_id instead of
  image_token_index when loading the Eagle3 drafter
---
 vllm/model_executor/models/qwen2_5_vl.py | 22 +++++++++++++++++++---
 vllm/v1/spec_decode/eagle.py             |  9 +++++++--
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index b740e6d87b74..0f4417b258f3 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -66,7 +66,7 @@
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP, SupportsQuant)
+from .interfaces import (MultiModalEmbeddings, SupportsEagle3, SupportsLoRA,
+                         SupportsMultiModal, SupportsPP, SupportsQuant)
 from .qwen2_vl import Qwen2VLDummyInputsBuilder as Qwen2_5_VLDummyInputsBuilder
 from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo,
                        apply_rotary_pos_emb_vision)
@@ -912,7 +912,7 @@ def _get_mm_fields_config(
                         dummy_inputs=Qwen2_5_VLDummyInputsBuilder)
 class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                          SupportsLoRA, SupportsPP,
-                                         SupportsQuant):
+                                         SupportsQuant, SupportsEagle3):
 
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
@@ -1137,6 +1137,22 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
+
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        """Set which layers emit auxiliary hidden states for EAGLE3."""
+        # Delegate to the underlying language model
+        assert hasattr(self.language_model, 'set_aux_hidden_state_layers')
+        self.language_model.set_aux_hidden_state_layers(layers)
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        """Get the layer indices for auxiliary hidden state outputs.
+
+        Note: The GPU model runner will override this with layers from
+        the speculative config if available, providing dynamic configuration.
+        """
+        # Delegate to underlying language model (Llama4ForCausalLM)
+        assert hasattr(self.language_model, 'get_eagle3_aux_hidden_state_layers')
+        self.language_model.get_eagle3_aux_hidden_state_layers()
 
     def get_multimodal_embeddings(self,
                                   **kwargs: object) -> MultiModalEmbeddings:
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 119f41d8580e..fe21560d473f 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -808,8 +808,13 @@ def load_model(self, target_model: nn.Module) -> None:
 
         if supports_multimodal(target_model):
             # handle multimodality
-            self.model.config.image_token_index = (
-                target_model.config.image_token_index)
+            if hasattr(target_model.config, "image_token_index"):
+                self.model.config.image_token_index = (
+                    target_model.config.image_token_index)
+            elif hasattr(target_model.config, "image_token_id"):
+                # Some verifier configs (e.g. Qwen2.5-VL) use image_token_id
+                self.model.config.image_token_index = (
+                    target_model.config.image_token_id)
             target_language_model = target_model.get_language_model()
         else:
             target_language_model = target_model

From 2aa1fcc0a6ef899e6b1dea8e340805069bb105be Mon Sep 17 00:00:00 2001
From: rahul-tuli
Date: Fri, 26 Sep 2025 21:00:07 +0000
Subject: [PATCH 3/3] fix: Resolve Qwen2.5-VL Eagle3 support issues

- Fix unidiomatic aux_hidden_state_layers initialization in qwen2.py
- Add missing return statement in qwen2_5_vl.py
  get_eagle3_aux_hidden_state_layers
- Replace the hasattr assert with direct delegation to the language
  model, inheriting its default auxiliary layers

These fixes enable Eagle3 speculative decoding support for Qwen2.5-VL
models. Successfully tested with Qwen2.5-VL-7B + Eagle3 configuration.
---
 vllm/model_executor/models/qwen2.py      | 2 +-
 vllm/model_executor/models/qwen2_5_vl.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index c536b0f60c30..cf8219e3445e 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -333,7 +333,7 @@ def __init__(self,
         else:
             self.norm = PPMissingLayer()
 
-        self.aux_hidden_state_layers = tuple[int, ...]()
+        self.aux_hidden_state_layers = tuple()
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 0f4417b258f3..b145eaa091dc 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1150,9 +1150,7 @@ def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
 
         Note: The GPU model runner will override this with layers from
         the speculative config if available, providing dynamic configuration.
         """
-        # Delegate to underlying language model (Llama4ForCausalLM)
-        assert hasattr(self.language_model, 'get_eagle3_aux_hidden_state_layers')
-        self.language_model.get_eagle3_aux_hidden_state_layers()
+        return self.language_model.get_eagle3_aux_hidden_state_layers()
 
     def get_multimodal_embeddings(self,
                                   **kwargs: object) -> MultiModalEmbeddings:
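
---

Usage sketch: a minimal end-to-end example of the flow this series enables.
This is a hedged sketch, not part of the patches: the model and draft paths
are placeholders, and the speculative_config dict form with the "eagle3"
method name is assumed from vLLM's public LLM API. Only the two draft-config
fields (eagle_aux_hidden_state_layer_ids, inference_type) that update_eagle3
copies onto the draft hf_config, and that _get_eagle3_aux_layers_from_config
reads at load time, come from this series.

    from vllm import LLM, SamplingParams

    # The draft checkpoint's speculators config is expected to carry, e.g.:
    #   "eagle_aux_hidden_state_layer_ids": [1, 23, 44]
    #   "inference_type": "text"
    # update_eagle3 forwards both fields to the draft hf_config; the GPU
    # model runner then calls set_aux_hidden_state_layers((1, 23, 44)) on
    # the verifier instead of using the model's hardcoded defaults, and the
    # text-only drafter ignores multimodal embeddings entirely.
    llm = LLM(
        model="meta-llama/Llama-4-Scout-17B-16E-Instruct",  # MM verifier
        speculative_config={
            "model": "path/to/eagle3-llama4-draft",  # placeholder path
            "method": "eagle3",
            "num_speculative_tokens": 3,
        },
    )

    params = SamplingParams(temperature=0.0, max_tokens=64)
    out = llm.generate(["Summarize speculative decoding in one sentence."],
                       params)
    print(out[0].outputs[0].text)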