4 changes: 3 additions & 1 deletion src/fenic/_backends/local/semantic_operators/parse_pdf.py
@@ -48,11 +48,13 @@ def __init__(
page_separator: Optional[str] = None,
describe_images: bool = False,
model_alias: Optional[ResolvedModelAlias] = None,
max_output_tokens: Optional[int] = None,
):
self.page_separator = page_separator
self.describe_images = describe_images
self.model = model
self.model_alias = model_alias
self.max_output_tokens = max_output_tokens

DocFolderLoader.check_file_extensions(input.to_list(), "pdf")

@@ -62,7 +64,7 @@ def __init__(
model=model,
operator_name="semantic.parse_pdf",
inference_config=InferenceConfiguration(
max_output_tokens=None,
max_output_tokens=max_output_tokens,
temperature=1.0, # Use a higher temperature so gemini flash models can handle complex table formatting. For more info see the conversation here: https://discuss.ai.google.dev/t/gemini-2-0-flash-has-a-weird-bug/65119/26
model_profile=model_alias.profile if model_alias else None,
),
3 changes: 2 additions & 1 deletion src/fenic/_backends/local/transpiler/expr_converter.py
@@ -712,8 +712,9 @@ def parse_pdf_fn(batch: pl.Series) -> pl.Series:
page_separator=logical.page_separator,
describe_images=logical.describe_images,
model_alias=logical.model_alias,
max_output_tokens=logical.max_output_tokens,
).execute()

return self._convert_expr(logical.expr).map_batches(
parse_pdf_fn, return_dtype=pl.Utf8
)
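For orientation, here is a minimal self-contained sketch of the parameter threading shown in the two hunks above. The class names (ParsePdfSketch, InferenceConfigSketch) are invented for illustration and are not fenic's actual classes: the operator now accepts an optional max_output_tokens and forwards it into the inference configuration instead of hard-coding None.

# Simplified illustration of the change: max_output_tokens flows from the
# operator's constructor into the inference configuration.
from dataclasses import dataclass
from typing import Optional


@dataclass
class InferenceConfigSketch:
    max_output_tokens: Optional[int]  # None -> no explicit cap from the operator
    temperature: float


class ParsePdfSketch:
    def __init__(
        self,
        page_separator: Optional[str] = None,
        describe_images: bool = False,
        max_output_tokens: Optional[int] = None,
    ):
        self.page_separator = page_separator
        self.describe_images = describe_images
        self.max_output_tokens = max_output_tokens
        self.inference_config = InferenceConfigSketch(
            max_output_tokens=max_output_tokens,  # previously hard-coded to None
            temperature=1.0,
        )


op = ParsePdfSketch(page_separator="\n---\n", max_output_tokens=4096)
assert op.inference_config.max_output_tokens == 4096
assert ParsePdfSketch().inference_config.max_output_tokens is None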
@@ -275,7 +275,7 @@ def _estimate_structured_output_overhead(self, response_format) -> int:
"""
return self.estimate_response_format_tokens(response_format)

def _get_max_output_tokens(self, request: FenicCompletionsRequest) -> int:
def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
"""Get maximum output tokens including thinking budget.

Args:
Expand Down Expand Up @@ -329,7 +329,7 @@ def estimate_tokens_for_request(self, request: FenicCompletionsRequest):
input_tokens += self._count_auxiliary_input_tokens(request)

# Estimate output tokens
output_tokens = self._get_max_output_tokens(request)
output_tokens = self._get_max_output_token_request_limit(request)

return TokenEstimate(
input_tokens=input_tokens,
@@ -171,7 +171,7 @@ def estimate_tokens_for_request(self, request: FenicEmbeddingsRequest) -> TokenE
output_tokens=0
)

def _get_max_output_tokens(self, request: FenicEmbeddingsRequest) -> int:
def _get_max_output_token_request_limit(self, request: FenicEmbeddingsRequest) -> int:
"""Get maximum output tokens (always 0 for embeddings).

Returns:
@@ -90,9 +90,13 @@ async def make_single_request(
common_params: dict[str, Any] = {
"model": self._model,
"messages": convert_messages(request.messages),
"max_completion_tokens": request.max_completion_tokens + profile_configuration.expected_additional_reasoning_tokens,
"n": 1,
}

max_completion_tokens = self.get_max_output_token_request_limit(request, profile_configuration)
if max_completion_tokens is not None:
common_params["max_completion_tokens"] = max_completion_tokens

if request.temperature:
common_params.update({"temperature": request.temperature})

@@ -213,3 +217,13 @@ def get_request_key(self, request: FenicCompletionsRequest) -> str:
A unique key for the request
"""
return generate_completion_request_key(request)

def get_max_output_token_request_limit(self, request: FenicCompletionsRequest, profile_config:OpenAICompletionProfileConfiguration) -> Optional[int]:
"""Return the maximum output token limit for a request.

Returns None if max_completion_tokens is not provided (no limit should be set).
If max_completion_tokens is provided, includes the thinking token budget with a safety margin.
"""
if request.max_completion_tokens is None:
return None
return request.max_completion_tokens + profile_config.expected_additional_reasoning_tokens
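As a hedged sketch of the request-building behaviour introduced above (names such as SimpleRequest, SimpleProfile, and build_params are invented, not fenic's API): max_completion_tokens is only attached to the payload when the caller supplied a cap, and the cap is padded by the profile's expected additional reasoning tokens.

from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class SimpleRequest:
    max_completion_tokens: Optional[int]


@dataclass
class SimpleProfile:
    expected_additional_reasoning_tokens: int = 0


def request_limit(req: SimpleRequest, profile: SimpleProfile) -> Optional[int]:
    # Mirrors get_max_output_token_request_limit: no cap given -> no limit at all.
    if req.max_completion_tokens is None:
        return None
    return req.max_completion_tokens + profile.expected_additional_reasoning_tokens


def build_params(req: SimpleRequest, profile: SimpleProfile) -> dict[str, Any]:
    params: dict[str, Any] = {"model": "gpt-x", "n": 1}  # placeholder model name
    limit = request_limit(req, profile)
    if limit is not None:
        params["max_completion_tokens"] = limit  # omitted entirely when uncapped
    return params


assert "max_completion_tokens" not in build_params(SimpleRequest(None), SimpleProfile())
assert build_params(SimpleRequest(1000), SimpleProfile(500))["max_completion_tokens"] == 1500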
@@ -121,7 +121,7 @@ def estimate_tokens_for_request(self, request: FenicEmbeddingsRequest) -> TokenE
input_tokens=self.token_counter.count_tokens(request.doc), output_tokens=0
)

def _get_max_output_tokens(self, request: FenicEmbeddingsRequest) -> int:
def _get_max_output_token_request_limit(self, request: FenicEmbeddingsRequest) -> int:
return 0

def reset_metrics(self):
114 changes: 57 additions & 57 deletions src/fenic/_inference/google/gemini_native_chat_completions_client.py
@@ -132,56 +132,6 @@ def count_tokens(self, messages: Tokenizable) -> int: # type: ignore[override]
# Re-expose for mypy – same implementation as parent.
return super().count_tokens(messages)

def _estimate_structured_output_overhead(self, response_format: ResolvedResponseFormat) -> int:
"""Use Google-specific response schema token estimation.

Args:
response_format: Pydantic model class defining the response format

Returns:
Estimated token overhead for structured output
"""
return self._estimate_response_schema_tokens(response_format)

def _get_max_output_tokens(self, request: FenicCompletionsRequest) -> Optional[int]:
"""Get maximum output tokens including thinking budget.

If max_completion_tokens is not set, return None.

Conservative estimate that includes both completion tokens and
thinking token budget with a safety margin.

Args:
request: The completion request

Returns:
Maximum output tokens (completion + thinking budget with safety margin)
"""
if request.max_completion_tokens is None:
return None
profile_config = self._profile_manager.get_profile_by_name(
request.model_profile
)
return request.max_completion_tokens + int(
1.5 * profile_config.thinking_token_budget
)

@cache # noqa: B019 – builtin cache OK here.
def _estimate_response_schema_tokens(self, response_format: ResolvedResponseFormat) -> int:
"""Estimate token count for a response format schema.

Uses Google's tokenizer to count tokens in a JSON schema representation
of the response format. Results are cached for performance.

Args:
response_format: Pydantic model class defining the response format

Returns:
Estimated token count for the response format
"""
schema_str = response_format.schema_fingerprint
return self._token_counter.count_tokens(schema_str)

def get_request_key(self, request: FenicCompletionsRequest) -> str:
"""Generate a unique key for the request.

@@ -196,19 +146,17 @@ def get_request_key(self, request: FenicCompletionsRequest) -> str:
def estimate_tokens_for_request(self, request: FenicCompletionsRequest):
"""Estimate the number of tokens for a request.

If the request provides a max_completion_tokens value, use that. Otherwise, estimate the output tokens based on the file size.

Args:
request: The request to estimate tokens for

Returns:
TokenEstimate: The estimated token usage
"""

# Count input tokens
input_tokens = self.count_tokens(request.messages)
input_tokens += self._count_auxiliary_input_tokens(request)

output_tokens = self._get_max_output_tokens(request) or self._model_parameters.max_output_tokens

output_tokens = self._estimate_output_tokens(request)
return TokenEstimate(input_tokens=input_tokens, output_tokens=output_tokens)

async def make_single_request(
@@ -228,16 +176,17 @@ async def make_single_request(
"""

profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
max_output_tokens = self._get_max_output_tokens(request)

generation_config: GenerateContentConfigDict = {
"temperature": request.temperature,
"response_logprobs": request.top_logprobs is not None,
"logprobs": request.top_logprobs,
"system_instruction": request.messages.system,
}

max_output_tokens = self._get_max_output_token_request_limit(request)
if max_output_tokens is not None:
generation_config["max_output_tokens"] = max_output_tokens

generation_config.update(profile_config.additional_generation_config)
if request.structured_output is not None:
generation_config.update(
@@ -355,3 +304,54 @@ async def make_single_request(
finally:
if file_obj:
await delete_file(self._client, file_obj.name)

@cache # noqa: B019 – builtin cache OK here.
def _estimate_response_schema_tokens(self, response_format: ResolvedResponseFormat) -> int:
"""Estimate token count for a response format schema.

Uses Google's tokenizer to count tokens in a JSON schema representation
of the response format. Results are cached for performance.

Args:
response_format: Pydantic model class defining the response format

Returns:
Estimated token count for the response format
"""
schema_str = response_format.schema_fingerprint
return self._token_counter.count_tokens(schema_str)

def _estimate_structured_output_overhead(self, response_format: ResolvedResponseFormat) -> int:
"""Use Google-specific response schema token estimation.

Args:
response_format: Pydantic model class defining the response format

Returns:
Estimated token overhead for structured output
"""
return self._estimate_response_schema_tokens(response_format)

def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
"""Estimate the number of output tokens for a request."""
estimated_output_tokens = request.max_completion_tokens or 0
if request.max_completion_tokens is None and request.messages.user_file:
# TODO(DY): the semantic operator should dictate how the file affects the token estimate
estimated_output_tokens = self.token_counter.count_file_output_tokens(request.messages)
return estimated_output_tokens + self._get_expected_additional_reasoning_tokens(request)

def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> Optional[int]:
"""Get the upper limit of output tokens for a request.

Returns None if max_completion_tokens is not provided (no limit should be set).
If max_completion_tokens is provided, includes the thinking token budget with a safety margin."""
if request.max_completion_tokens is None:
return None
return request.max_completion_tokens + self._get_expected_additional_reasoning_tokens(request)

def _get_expected_additional_reasoning_tokens(self, request: FenicCompletionsRequest) -> int:
"""Get the expected additional reasoning tokens for a request. Include a safety margin."""
profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
return int(
1.5 * profile_config.thinking_token_budget
)
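To make the two new Gemini code paths concrete, a small illustrative sketch with made-up numbers (the helper functions below are stand-ins, not the client's methods): the output-token estimate used for rate limiting falls back to a file-based guess when no cap is given, while the request limit is only set when a cap exists, and both add a 1.5x safety margin on the profile's thinking token budget.

from typing import Optional


def expected_additional_reasoning_tokens(thinking_token_budget: int) -> int:
    return int(1.5 * thinking_token_budget)  # safety margin used in the diff above


def estimate_output_tokens(
    max_completion_tokens: Optional[int],
    file_output_token_guess: int,  # stand-in for count_file_output_tokens(...)
    thinking_token_budget: int,
) -> int:
    base = max_completion_tokens or 0
    if max_completion_tokens is None and file_output_token_guess:
        base = file_output_token_guess
    return base + expected_additional_reasoning_tokens(thinking_token_budget)


def max_output_token_request_limit(
    max_completion_tokens: Optional[int], thinking_token_budget: int
) -> Optional[int]:
    if max_completion_tokens is None:
        return None  # max_output_tokens is left out of the generation config
    return max_completion_tokens + expected_additional_reasoning_tokens(thinking_token_budget)


# An uncapped file-parsing request: estimated for rate limiting,
# but sent to the API without a hard output limit.
assert estimate_output_tokens(None, file_output_token_guess=3000, thinking_token_budget=1024) == 4536
assert max_output_token_request_limit(None, thinking_token_budget=1024) is None
assert max_output_token_request_limit(2048, thinking_token_budget=1024) == 2048 + 1536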
2 changes: 1 addition & 1 deletion src/fenic/_inference/language_model.py
@@ -22,7 +22,7 @@

@dataclass
class InferenceConfiguration:
# If max_output_tokens is not provided, do not include it in the request.
# If max_output_tokens is not provided, model_client will add a guardrail based on the estimated output tokens.
max_output_tokens: Optional[int]
temperature: float
top_logprobs: Optional[int] = None
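A rough sketch of the contract this comment describes, with invented names: when the operator-level configuration leaves max_output_tokens as None, the per-request cap (if any) is decided by the concrete model client, which may fall back to an estimate-based guardrail or send the request uncapped.

from dataclasses import dataclass
from typing import Optional


@dataclass
class InferenceConfigurationSketch:
    max_output_tokens: Optional[int]
    temperature: float


def resolve_request_cap(
    config: InferenceConfigurationSketch, client_guardrail: Optional[int]
) -> Optional[int]:
    # An explicit operator-level setting wins; otherwise defer to whatever cap
    # (possibly none) the model client derives for this request.
    if config.max_output_tokens is not None:
        return config.max_output_tokens
    return client_guardrail


assert resolve_request_cap(InferenceConfigurationSketch(None, 1.0), client_guardrail=8192) == 8192
assert resolve_request_cap(InferenceConfigurationSketch(2048, 1.0), client_guardrail=8192) == 2048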
4 changes: 2 additions & 2 deletions src/fenic/_inference/model_client.py
@@ -245,8 +245,8 @@ def _estimate_structured_output_overhead(self, response_format: ResolvedResponse


@abstractmethod
def _get_max_output_tokens(self, request: RequestT) -> int:
"""Get conservative output token estimate. Override in subclasses for provider-specific logic."""
def _get_max_output_token_request_limit(self, request: RequestT) -> int:
"""Get the upper limit of output tokens to set on a request."""
pass

#
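A toy sketch of the renamed abstract contract, with invented class names (the signature in the diff still annotates int, but completion clients may now return None to mean "no cap"): completion clients implement _get_max_output_token_request_limit per provider, and embeddings clients simply report 0.

from abc import ABC, abstractmethod
from typing import Any, Optional


class ModelClientSketch(ABC):
    @abstractmethod
    def _get_max_output_token_request_limit(self, request: Any) -> Optional[int]:
        """Upper limit of output tokens to set on a request (None = no cap)."""


class EmbeddingsClientSketch(ModelClientSketch):
    def _get_max_output_token_request_limit(self, request: Any) -> int:
        return 0  # embeddings produce no completion tokens


assert EmbeddingsClientSketch()._get_max_output_token_request_limit(object()) == 0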
@@ -65,6 +65,7 @@ def __init__(
profile_configurations=profiles,
default_profile_name=default_profile_name,
)

self._core = OpenAIChatCompletionsCore(
model=model,
model_provider=ModelProvider.OPENAI,
@@ -108,7 +109,7 @@ def estimate_tokens_for_request(self, request: FenicCompletionsRequest) -> Token
"""
return TokenEstimate(
input_tokens=self.token_counter.count_tokens(request.messages),
output_tokens=self._get_max_output_tokens(request)
output_tokens=self._estimate_output_tokens(request)
)

def reset_metrics(self):
@@ -123,10 +124,21 @@ def get_metrics(self) -> LMMetrics:
"""
return self._core.get_metrics()

def _get_max_output_tokens(self, request: FenicCompletionsRequest) -> int:
"""Conservative estimate: max_completion_tokens + reasoning effort-based thinking tokens."""
base_tokens = request.max_completion_tokens

# Get profile-specific reasoning effort
def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
"""Estimate the number of output tokens for a request."""
base_tokens = request.max_completion_tokens or 0
if request.max_completion_tokens is None and request.messages.user_file:
# TODO(DY): the semantic operator should dictate how the file affects the token estimate
base_tokens += self.token_counter.count_file_output_tokens(messages=request.messages)
profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
return base_tokens + profile_config.expected_additional_reasoning_tokens

def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
"""Return the maximum output token limit for a request.

For file parsing requests, use a guardrail limit of 8192 tokens (the lowest output limit of a VLM model we support).

Include the thinking token budget with a safety margin.
"""
profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
return self._core.get_max_output_token_request_limit(request, profile_config)
@@ -107,7 +107,7 @@ def get_metrics(self) -> RMMetrics:
"""
return self._core.get_metrics()

def _get_max_output_tokens(self, request: RequestT) -> int:
def _get_max_output_token_request_limit(self, request: RequestT) -> int:
return 0

async def validate_api_key(self):
@@ -94,7 +94,7 @@ async def make_single_request(
common_params = {
"model": self.model,
"messages": convert_messages(request.messages),
"max_completion_tokens": self._get_max_output_tokens(request),
"max_completion_tokens": self._get_max_output_token_request_limit(request),
"n": 1,
}

@@ -239,7 +239,7 @@ def estimate_tokens_for_request(
) -> TokenEstimate:
return TokenEstimate(
input_tokens=self.token_counter.count_tokens(request.messages),
output_tokens=self._get_max_output_tokens(request),
output_tokens=self.token_counter.count_tokens(request.messages) + self._get_expected_additional_reasoning_tokens(request),
)

def reset_metrics(self):
@@ -248,7 +248,7 @@ def reset_metrics(self):
def get_metrics(self) -> LMMetrics:
return self._metrics

def _get_max_output_tokens(self, request: FenicCompletionsRequest) -> int:
def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
"""Get the upper limit of output tokens for a request.

If max_completion_tokens is not set, don't apply a limit and return None.

Include the thinking token budget with a safety margin."""
if request.max_completion_tokens is None:
return None
return request.max_completion_tokens + self._get_expected_additional_reasoning_tokens(request)

# This is a slightly less conservative estimate than the OpenRouter documentation on how reasoning_effort is used to
1 change: 0 additions & 1 deletion src/fenic/_inference/rate_limit_strategy.py
@@ -10,7 +10,6 @@

logger = logging.getLogger(__name__)


@dataclass
class TokenEstimate:
input_tokens: int = 0