Commit 05e8128 (1 parent: d541bd5)

Adding Compute-Context-Length (CCL)

Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>

23 files changed: +892 / -210 lines
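Taken together, the diffs below thread two bucket lists through the runtime: comp_ctx_lengths_prefill and comp_ctx_lengths_decode. Each entry is a compiled context length, and the runtime signals the active bucket by feeding a zero buffer of that length as the "comp_ctx_lengths" model input. A minimal sketch of what those arguments look like; the bucket values are placeholders, since the commit leaves the concrete lists to the caller:

import numpy as np

# Placeholder bucket values; ascending order is what the promotion logic below assumes.
comp_ctx_lengths_prefill = [256, 512, 1024]   # prefill buckets
comp_ctx_lengths_decode = [512, 1024]         # decode buckets

# One zero buffer per bucket; its length is what tells the compiled model
# which context length to compute over (mirrors the diffs below).
list_of_buffers = [np.zeros(length) for length in comp_ctx_lengths_prefill]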

QEfficient/generation/text_generation_inference.py

Lines changed: 0 additions & 2 deletions

@@ -814,14 +814,12 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_i
             self.list_of_comp_ctx_lengths_prefill = [np.zeros(length) for length in self.comp_ctx_lengths_prefill]
             prefill_ccl_id = 0
             inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths_prefill[prefill_ccl_id]
-            print(f"CCL Prefill: {self.comp_ctx_lengths_prefill[prefill_ccl_id]}")

         for i in range(num_chunks):
             if self.comp_ctx_lengths_prefill is not None:
                 if (i + 1) * self._prefill_seq_len > self.comp_ctx_lengths_prefill[prefill_ccl_id]:
                     prefill_ccl_id = min(prefill_ccl_id + 1, len(self.comp_ctx_lengths_prefill) - 1)
                     inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths_prefill[prefill_ccl_id]
-                    print(f"CCL Prefill: {self.comp_ctx_lengths_prefill[prefill_ccl_id]}")

             chunk_inputs = inputs.copy()
             chunk_inputs["input_ids"] = inputs["input_ids"][

QEfficient/generation/vlm_generation.py

Lines changed: 16 additions & 0 deletions

@@ -83,6 +83,8 @@ def __init__(
         vision_qpc_path: str,
         device_id: Optional[List[int]] = None,
         ctx_len: Optional[int] = None,
+        comp_ctx_lengths_prefill: Optional[List[int]] = None,
+        comp_ctx_lengths_decode: Optional[List[int]] = None,
         enable_debug_logs: bool = False,
         write_io_dir: Optional[str] = None,
         full_batch_size: Optional[int] = None,
@@ -123,6 +125,8 @@ def __init__(
             qpc_path=lang_qpc_path,
             full_batch_size=full_batch_size,
             ctx_len=ctx_len,
+            comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
+            comp_ctx_lengths_decode=comp_ctx_lengths_decode,
             device_id=device_id,
             enable_debug_logs=enable_debug_logs,
             write_io_dir=write_io_dir,
@@ -294,6 +298,11 @@ def _execute_chunked_prefill(
         outputs = None
         chunk_image_idx = None

+        if self.comp_ctx_lengths_prefill is not None:
+            self.list_of_comp_ctx_lengths_prefill = [np.zeros(length) for length in self.comp_ctx_lengths_prefill]
+            prefill_ccl_id = 0
+            lang_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths_prefill[prefill_ccl_id]
+
         for i in range(num_chunks):
             input_ids_slice = lang_inputs["input_ids"][:, i * self._prefill_seq_len : (i + 1) * self._prefill_seq_len]
             position_ids_slice = lang_inputs["position_ids"][
@@ -312,6 +321,13 @@ def _execute_chunked_prefill(
             if "cross_attention_mask" in lang_inputs:
                 chunk_inputs["cross_attention_mask"] = lang_inputs["cross_attention_mask"]

+            if self.comp_ctx_lengths_prefill is not None:
+                if (i + 1) * self._prefill_seq_len > self.comp_ctx_lengths_prefill[prefill_ccl_id]:
+                    prefill_ccl_id = min(prefill_ccl_id + 1, len(self.comp_ctx_lengths_prefill) - 1)
+                    lang_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths_prefill[prefill_ccl_id]
+
+                chunk_inputs["comp_ctx_lengths"] = lang_inputs["comp_ctx_lengths"]
+
             outputs = self._session.run(chunk_inputs)

             if "image_idx_output" in outputs:

QEfficient/transformers/cache_utils.py

Lines changed: 9 additions & 5 deletions

@@ -622,6 +622,7 @@ def update(
         is_sliding_layer = cache_kwargs.get("is_sliding")
         sliding_window = cache_kwargs.get("sliding_window")
         batch_index = cache_kwargs.get("batch_index", None)  # Check and fetch batch index value from the kwargs
+        comp_ctx_len = cache_kwargs.get("CCL")

         if is_sliding_layer:
             kv_position_ids = torch.where(position_ids == -1, position_ids, position_ids % sliding_window)
@@ -649,7 +650,10 @@ def update(
         k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]

         # Original Gather
-        ctx_len = self.key_cache[layer_idx].shape[2]
+        if is_sliding_layer:
+            ctx_len = k_out.shape[2]
+        else:
+            ctx_len = comp_ctx_len
         ctx_indices = torch.arange(ctx_len)[None, None, ...]
         gather_limit = position_ids.max(1, keepdim=True).values.unsqueeze(1)
         invalid_mask = ctx_indices > gather_limit
@@ -660,11 +664,11 @@ def update(
         ctx_indices = torch.where(invalid_mask, invalid_idx_value, ctx_indices)

         if batch_index is not None:
-            k_out = CtxGatherFuncCB.apply(k_out, batch_index, ctx_indices)
-            v_out = CtxGatherFuncCB.apply(v_out, batch_index, ctx_indices)
+            k_out = CtxGatherFuncCB.apply(k_out, batch_index, ctx_indices, ctx_len)
+            v_out = CtxGatherFuncCB.apply(v_out, batch_index, ctx_indices, ctx_len)
         else:
-            k_out = CtxGatherFunc.apply(k_out, ctx_indices)
-            v_out = CtxGatherFunc.apply(v_out, ctx_indices)
+            k_out = CtxGatherFunc.apply(k_out, ctx_indices, ctx_len)
+            v_out = CtxGatherFunc.apply(v_out, ctx_indices, ctx_len)

         v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)
         return k_out, v_out
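The effect of the new ctx_len choice is that full-attention layers gather only the first comp_ctx_len cache rows instead of the whole compiled context, while sliding-window layers keep their full (already small) window. A minimal sketch of the narrowed gather, using plain index_select in place of the repo's CtxGatherFunc custom op and assuming a [batch, heads, ctx_len, head_dim] cache layout:

import torch

k_cache = torch.randn(1, 8, 4096, 64)   # full compiled context of 4096 (placeholder)
comp_ctx_len = 1024                      # active CCL bucket (placeholder)

ctx_indices = torch.arange(comp_ctx_len)       # was arange over the full 4096 before this commit
k_out = k_cache.index_select(2, ctx_indices)   # -> [1, 8, 1024, 64]
# Downstream attention now scores 1024 key positions instead of 4096,
# which is where the CCL compute saving comes from.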

QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py

Lines changed: 10 additions & 0 deletions

@@ -417,6 +417,7 @@ def forward(
         attention_mask: Optional[torch.Tensor],
         position_ids: Optional[torch.LongTensor] = None,
         past_key_value: Optional[Cache] = None,
+        comp_ctx_lengths: Optional[torch.LongTensor] = None,
         batch_index: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
         sliding_mask=None,
@@ -433,6 +434,8 @@ def forward(
         query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

         if past_key_value is not None:
+            if comp_ctx_lengths is not None:
+                attention_mask = attention_mask[:, :, :, : comp_ctx_lengths.shape[-1]]
             # sin and cos are specific to RoPE models; cache_position needed for the static cache
             cache_kwargs = {
                 "sin": sin,
@@ -442,6 +445,7 @@ def forward(
                 "config": self.config,
                 "is_sliding": self.sliding_window is not None,
                 "sliding_window": past_key_value.sliding_window_len,
+                "CCL": attention_mask.shape[-1],
             }
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

@@ -476,6 +480,7 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_value: Optional[Cache] = None,
+        comp_ctx_lengths: Optional[torch.LongTensor] = None,
         batch_index: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = False,
         use_cache: Optional[bool] = False,
@@ -492,6 +497,7 @@ def forward(
             attention_mask=attention_mask,
             position_ids=position_ids,
             past_key_value=past_key_value,
+            comp_ctx_lengths=comp_ctx_lengths,
             batch_index=batch_index,
             use_cache=use_cache,
             cache_position=cache_position,
@@ -526,6 +532,7 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[Cache] = None,
+        comp_ctx_lengths: Optional[torch.LongTensor] = None,
         batch_index: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
@@ -586,6 +593,7 @@ def forward(
                 attention_mask=causal_mask,
                 position_ids=position_ids,
                 past_key_value=past_key_values,
+                comp_ctx_lengths=comp_ctx_lengths,
                 batch_index=batch_index,
                 use_cache=use_cache,
                 output_attentions=output_attentions,
@@ -619,6 +627,7 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[Cache] = None,
+        comp_ctx_lengths: Optional[torch.LongTensor] = None,
         batch_index: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
@@ -670,6 +679,7 @@ def forward(
             attention_mask=attention_mask,
             position_ids=position_ids,
             past_key_values=past_key_values,
+            comp_ctx_lengths=comp_ctx_lengths,
             batch_index=batch_index,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
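In the model itself, comp_ctx_lengths is threaded through every forward signature and call site, and its only substantive uses are in the attention block: slice the mask down to the active bucket, then pass the resulting width to the cache as "CCL". A minimal sketch of those two steps, assuming a boolean mask of shape [batch, 1, q_len, ctx_len] (shape and dtype are assumptions, not fixed by the diff):

import torch

attention_mask = torch.zeros(1, 1, 1, 4096, dtype=torch.bool)  # placeholder mask
comp_ctx_lengths = torch.zeros(1024)  # zero buffer whose length encodes the bucket

# 1) Narrow the mask to the active bucket, exactly as in the hunk above.
attention_mask = attention_mask[:, :, :, : comp_ctx_lengths.shape[-1]]

# 2) Hand the resulting width to the cache; update() reads it as cache_kwargs["CCL"].
cache_kwargs = {"CCL": attention_mask.shape[-1]}  # -> 1024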
