@@ -132,56 +132,6 @@ def count_tokens(self, messages: Tokenizable) -> int: # type: ignore[override]
         # Re-expose for mypy – same implementation as parent.
         return super().count_tokens(messages)
 
-    def _estimate_structured_output_overhead(self, response_format: ResolvedResponseFormat) -> int:
-        """Use Google-specific response schema token estimation.
-
-        Args:
-            response_format: Pydantic model class defining the response format
-
-        Returns:
-            Estimated token overhead for structured output
-        """
-        return self._estimate_response_schema_tokens(response_format)
-
-    def _get_max_output_tokens(self, request: FenicCompletionsRequest) -> Optional[int]:
-        """Get maximum output tokens including thinking budget.
-
-        If max_completion_tokens is not set, return None.
-
-        Conservative estimate that includes both completion tokens and
-        thinking token budget with a safety margin.
-
-        Args:
-            request: The completion request
-
-        Returns:
-            Maximum output tokens (completion + thinking budget with safety margin)
-        """
-        if request.max_completion_tokens is None:
-            return None
-        profile_config = self._profile_manager.get_profile_by_name(
-            request.model_profile
-        )
-        return request.max_completion_tokens + int(
-            1.5 * profile_config.thinking_token_budget
-        )
-
-    @cache  # noqa: B019 – builtin cache OK here.
-    def _estimate_response_schema_tokens(self, response_format: ResolvedResponseFormat) -> int:
-        """Estimate token count for a response format schema.
-
-        Uses Google's tokenizer to count tokens in a JSON schema representation
-        of the response format. Results are cached for performance.
-
-        Args:
-            response_format: Pydantic model class defining the response format
-
-        Returns:
-            Estimated token count for the response format
-        """
-        schema_str = response_format.schema_fingerprint
-        return self._token_counter.count_tokens(schema_str)
-
     def get_request_key(self, request: FenicCompletionsRequest) -> str:
         """Generate a unique key for the request.
 
@@ -196,19 +146,17 @@ def get_request_key(self, request: FenicCompletionsRequest) -> str:
     def estimate_tokens_for_request(self, request: FenicCompletionsRequest):
         """Estimate the number of tokens for a request.
 
+        If the request provides a max_completion_tokens value, use it. Otherwise, estimate the output tokens from the size of the attached file.
+
         Args:
             request: The request to estimate tokens for
 
         Returns:
             TokenEstimate: The estimated token usage
         """
-
-        # Count input tokens
         input_tokens = self.count_tokens(request.messages)
         input_tokens += self._count_auxiliary_input_tokens(request)
-
-        output_tokens = self._get_max_output_tokens(request) or self._model_parameters.max_output_tokens
-
+        output_tokens = self._estimate_output_tokens(request)
         return TokenEstimate(input_tokens=input_tokens, output_tokens=output_tokens)
 
     async def make_single_request(
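
With the fallback to the model-wide max_output_tokens gone, the output side of the estimate is now derived per request. A minimal sketch of the resulting arithmetic, assuming a hypothetical request with max_completion_tokens=500 and a profile thinking_token_budget=1000 (all values invented for illustration):

    # Sketch, not part of the PR: the post-change output-token math.
    max_completion_tokens = 500                           # assumed request cap
    thinking_token_budget = 1000                          # assumed profile budget
    reasoning_margin = int(1.5 * thinking_token_budget)   # safety margin from the diff
    output_tokens = max_completion_tokens + reasoning_margin
    assert output_tokens == 2000
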
@@ -228,7 +176,7 @@ async def make_single_request(
228176 """
229177
230178 profile_config = self ._profile_manager .get_profile_by_name (request .model_profile )
231- max_output_tokens = self ._get_max_output_tokens (request )
179+ max_output_tokens = self ._get_max_output_token_request_limit (request )
232180
233181 generation_config : GenerateContentConfigDict = {
234182 "temperature" : request .temperature ,
@@ -355,3 +303,58 @@ async def make_single_request(
         finally:
             if file_obj:
                 await delete_file(self._client, file_obj.name)
+
+    @cache  # noqa: B019 – builtin cache OK here.
+    def _estimate_response_schema_tokens(self, response_format: ResolvedResponseFormat) -> int:
+        """Estimate token count for a response format schema.
+
+        Uses Google's tokenizer to count tokens in a JSON schema representation
+        of the response format. Results are cached for performance.
+
+        Args:
+            response_format: Pydantic model class defining the response format
+
+        Returns:
+            Estimated token count for the response format
+        """
+        schema_str = response_format.schema_fingerprint
+        return self._token_counter.count_tokens(schema_str)
+
+    def _estimate_structured_output_overhead(self, response_format: ResolvedResponseFormat) -> int:
+        """Use Google-specific response schema token estimation.
+
+        Args:
+            response_format: Pydantic model class defining the response format
+
+        Returns:
+            Estimated token overhead for structured output
+        """
+        return self._estimate_response_schema_tokens(response_format)
+
+    def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of output tokens for a request."""
+        estimated_output_tokens = request.max_completion_tokens or 0
+        if request.max_completion_tokens is None and request.messages.user_file:
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            estimated_output_tokens = self._token_counter.count_file_output_tokens(request.messages)
+        return estimated_output_tokens + self._get_expected_additional_reasoning_tokens(request)
+
+    def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> Optional[int]:
+        """Get the upper limit of output tokens for a request.
+
+        If max_completion_tokens is not set, fall back to a file-size-based guardrail for file requests.
+
+        Include the thinking token budget with a safety margin."""
+        max_output_tokens = request.max_completion_tokens or 0
+        if request.max_completion_tokens is None and request.messages.user_file:
+            # Guardrail to ensure the model uses a sane amount of output tokens.
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            max_output_tokens = self._token_counter.count_file_output_tokens(request.messages) * 2
+        return max_output_tokens + self._get_expected_additional_reasoning_tokens(request)
+
+    def _get_expected_additional_reasoning_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Get the expected additional reasoning tokens for a request. Include a safety margin."""
+        profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
+        return int(
+            1.5 * profile_config.thinking_token_budget
+        )
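
Taken together, the estimate and the request limit diverge only for file requests without an explicit cap: the estimate uses the projected file output tokens as-is, while the limit doubles them as a guardrail before the API call. A small sketch of both branches, with every value assumed for illustration:

    # Sketch, not part of the PR: comparing the two output-token paths.
    thinking_token_budget = 1000                          # assumed profile budget
    reasoning_margin = int(1.5 * thinking_token_budget)   # 1500

    # Branch 1: max_completion_tokens provided (e.g. 500) -> estimate == limit.
    estimate_capped = 500 + reasoning_margin              # 2000
    limit_capped = 500 + reasoning_margin                 # 2000

    # Branch 2: file request with no cap; count_file_output_tokens -> 800 (assumed).
    file_output_tokens = 800
    estimate_file = file_output_tokens + reasoning_margin        # 2300, expected usage
    limit_file = file_output_tokens * 2 + reasoning_margin       # 3100, guardrail cap
    assert limit_file > estimate_file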