From 7cba9a4c396007eb9f181b499305488dba629d08 Mon Sep 17 00:00:00 2001 From: prabhath004 Date: Wed, 5 Nov 2025 18:26:49 -0500 Subject: [PATCH 1/2] feat: add max_file_size support to AzureBlob source Add optional max_file_size parameter to filter files by size in both list() and get_value() APIs. Files exceeding the limit are treated as non-existent. Closes #1251 --- docs/docs/sources/azureblob.md | 8 +++-- .../sources/_engine_builtin_specs.py | 1 + src/ops/sources/azure_blob.rs | 30 +++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/docs/docs/sources/azureblob.md b/docs/docs/sources/azureblob.md index f9966f6a..e9a18d36 100644 --- a/docs/docs/sources/azureblob.md +++ b/docs/docs/sources/azureblob.md @@ -63,8 +63,6 @@ The spec takes the following fields: * `excluded_patterns` (`list[str]`, optional): a list of glob patterns to exclude files, e.g. `["*.tmp", "**/*.log"]`. Any file or directory matching these patterns will be excluded even if they match `included_patterns`. If not specified, no files will be excluded. -* `sas_token` (`cocoindex.TransientAuthEntryReference[str]`, optional): a SAS token for authentication. -* `account_access_key` (`cocoindex.TransientAuthEntryReference[str]`, optional): an account access key for authentication. :::info @@ -72,6 +70,12 @@ The spec takes the following fields: ::: +* `max_file_size` (`int`, optional): if provided, files exceeding this size in bytes will be treated as non-existent and skipped during processing. + This is useful to avoid processing large files that are not relevant to your use case, such as videos or backups. + If not specified, no size limit is applied. +* `sas_token` (`cocoindex.TransientAuthEntryReference[str]`, optional): a SAS token for authentication. +* `account_access_key` (`cocoindex.TransientAuthEntryReference[str]`, optional): an account access key for authentication. + ### Schema The output is a [*KTable*](/docs/core/data_types#ktable) with the following sub fields: diff --git a/python/cocoindex/sources/_engine_builtin_specs.py b/python/cocoindex/sources/_engine_builtin_specs.py index 6c90307b..d6154009 100644 --- a/python/cocoindex/sources/_engine_builtin_specs.py +++ b/python/cocoindex/sources/_engine_builtin_specs.py @@ -77,6 +77,7 @@ class AzureBlob(op.SourceSpec): binary: bool = False included_patterns: list[str] | None = None excluded_patterns: list[str] | None = None + max_file_size: int | None = None sas_token: TransientAuthEntryReference[str] | None = None account_access_key: TransientAuthEntryReference[str] | None = None diff --git a/src/ops/sources/azure_blob.rs b/src/ops/sources/azure_blob.rs index 4f56b719..58941f82 100644 --- a/src/ops/sources/azure_blob.rs +++ b/src/ops/sources/azure_blob.rs @@ -19,6 +19,7 @@ pub struct Spec { binary: bool, included_patterns: Option>, excluded_patterns: Option>, + max_file_size: Option, /// SAS token for authentication. Takes precedence over account_access_key. sas_token: Option>, @@ -32,6 +33,7 @@ struct Executor { prefix: Option, binary: bool, pattern_matcher: PatternMatcher, + max_file_size: Option, } fn datetime_to_ordinal(dt: &time::OffsetDateTime) -> Ordinal { @@ -73,6 +75,15 @@ impl SourceExecutor for Executor { // Only include files (not directories) if key.ends_with('/') { continue; } + // Check file size limit + if let Some(max_size) = self.max_file_size { + if let Some(size) = blob.properties.content_length { + if size > max_size { + continue; + } + } + } + if self.pattern_matcher.is_file_included(key) { let ordinal = Some(datetime_to_ordinal(&blob.properties.last_modified)); batch.push(PartialSourceRow { @@ -115,6 +126,24 @@ impl SourceExecutor for Executor { }); } + // Check file size limit + if let Some(max_size) = self.max_file_size { + let blob_client = self + .client + .container_client(&self.container_name) + .blob_client(key_str.as_ref()); + let properties = blob_client.get_properties().await?; + if let Some(size) = properties.blob.properties.content_length { + if size > max_size { + return Ok(PartialSourceRowData { + value: Some(SourceValue::NonExistence), + ordinal: Some(Ordinal::unavailable()), + content_version_fp: None, + }); + } + } + } + let blob_client = self .client .container_client(&self.container_name) @@ -238,6 +267,7 @@ impl SourceFactoryBase for Factory { prefix: spec.prefix, binary: spec.binary, pattern_matcher: PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?, + max_file_size: spec.max_file_size, })) } } From ae9f4d7874a89b01fe56fe25a78304e936e9b504 Mon Sep 17 00:00:00 2001 From: prabhath004 Date: Fri, 7 Nov 2025 09:30:36 -0500 Subject: [PATCH 2/2] fix: correct type comparison for blob content_length Azure blob content_length is u64, not Option, so compare directly with max_size cast to u64 instead of unwrapping Option. --- src/ops/sources/azure_blob.rs | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/ops/sources/azure_blob.rs b/src/ops/sources/azure_blob.rs index 58941f82..25a7fdb9 100644 --- a/src/ops/sources/azure_blob.rs +++ b/src/ops/sources/azure_blob.rs @@ -77,10 +77,8 @@ impl SourceExecutor for Executor { // Check file size limit if let Some(max_size) = self.max_file_size { - if let Some(size) = blob.properties.content_length { - if size > max_size { - continue; - } + if blob.properties.content_length > max_size as u64 { + continue; } } @@ -133,14 +131,12 @@ impl SourceExecutor for Executor { .container_client(&self.container_name) .blob_client(key_str.as_ref()); let properties = blob_client.get_properties().await?; - if let Some(size) = properties.blob.properties.content_length { - if size > max_size { - return Ok(PartialSourceRowData { - value: Some(SourceValue::NonExistence), - ordinal: Some(Ordinal::unavailable()), - content_version_fp: None, - }); - } + if properties.blob.properties.content_length > max_size as u64 { + return Ok(PartialSourceRowData { + value: Some(SourceValue::NonExistence), + ordinal: Some(Ordinal::unavailable()), + content_version_fp: None, + }); } }