diff --git a/docs/docs/sources/azureblob.md b/docs/docs/sources/azureblob.md
index f9966f6a..e9a18d36 100644
--- a/docs/docs/sources/azureblob.md
+++ b/docs/docs/sources/azureblob.md
@@ -63,8 +63,6 @@ The spec takes the following fields:
 * `excluded_patterns` (`list[str]`, optional): a list of glob patterns to exclude files, e.g. `["*.tmp", "**/*.log"]`.
   Any file or directory matching these patterns will be excluded even if they match `included_patterns`.
   If not specified, no files will be excluded.
-* `sas_token` (`cocoindex.TransientAuthEntryReference[str]`, optional): a SAS token for authentication.
-* `account_access_key` (`cocoindex.TransientAuthEntryReference[str]`, optional): an account access key for authentication.
 
 :::info
 
@@ -72,6 +70,12 @@ The spec takes the following fields:
 
 :::
 
+* `max_file_size` (`int`, optional): if provided, files exceeding this size in bytes will be treated as non-existent and skipped during processing.
+  This is useful to avoid processing large files that are not relevant to your use case, such as videos or backups.
+  If not specified, no size limit is applied.
+* `sas_token` (`cocoindex.TransientAuthEntryReference[str]`, optional): a SAS token for authentication.
+* `account_access_key` (`cocoindex.TransientAuthEntryReference[str]`, optional): an account access key for authentication.
+
 ### Schema
 
 The output is a [*KTable*](/docs/core/data_types#ktable) with the following sub fields:
diff --git a/python/cocoindex/sources/_engine_builtin_specs.py b/python/cocoindex/sources/_engine_builtin_specs.py
index 8a57d8c7..473ac842 100644
--- a/python/cocoindex/sources/_engine_builtin_specs.py
+++ b/python/cocoindex/sources/_engine_builtin_specs.py
@@ -90,6 +90,7 @@ class AzureBlob(op.SourceSpec):
     binary: bool = False
     included_patterns: list[str] | None = None
     excluded_patterns: list[str] | None = None
+    max_file_size: int | None = None
     sas_token: TransientAuthEntryReference[str] | None = None
     account_access_key: TransientAuthEntryReference[str] | None = None
 
diff --git a/src/ops/sources/azure_blob.rs b/src/ops/sources/azure_blob.rs
index 4f56b719..25a7fdb9 100644
--- a/src/ops/sources/azure_blob.rs
+++ b/src/ops/sources/azure_blob.rs
@@ -19,6 +19,7 @@ pub struct Spec {
     binary: bool,
     included_patterns: Option<Vec<String>>,
     excluded_patterns: Option<Vec<String>>,
+    max_file_size: Option<i64>,
 
     /// SAS token for authentication. Takes precedence over account_access_key.
     sas_token: Option<AuthEntryReference<String>>,
@@ -32,6 +33,7 @@ struct Executor {
     prefix: Option<String>,
     binary: bool,
     pattern_matcher: PatternMatcher,
+    max_file_size: Option<i64>,
 }
 
 fn datetime_to_ordinal(dt: &time::OffsetDateTime) -> Ordinal {
@@ -73,6 +75,13 @@ impl SourceExecutor for Executor {
 
             // Only include files (not directories)
             if key.ends_with('/') { continue; }
+            // Check file size limit
+            if let Some(max_size) = self.max_file_size {
+                if blob.properties.content_length > max_size as u64 {
+                    continue;
+                }
+            }
+
             if self.pattern_matcher.is_file_included(key) {
                 let ordinal = Some(datetime_to_ordinal(&blob.properties.last_modified));
                 batch.push(PartialSourceRow {
@@ -115,6 +124,22 @@ impl SourceExecutor for Executor {
             });
         }
 
+        // Check file size limit
+        if let Some(max_size) = self.max_file_size {
+            let blob_client = self
+                .client
+                .container_client(&self.container_name)
+                .blob_client(key_str.as_ref());
+            let properties = blob_client.get_properties().await?;
+            if properties.blob.properties.content_length > max_size as u64 {
+                return Ok(PartialSourceRowData {
+                    value: Some(SourceValue::NonExistence),
+                    ordinal: Some(Ordinal::unavailable()),
+                    content_version_fp: None,
+                });
+            }
+        }
+
         let blob_client = self
             .client
             .container_client(&self.container_name)
@@ -238,6 +263,7 @@ impl SourceFactoryBase for Factory {
             prefix: spec.prefix,
             binary: spec.binary,
             pattern_matcher: PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?,
+            max_file_size: spec.max_file_size,
         }))
     }
}
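
For reference, a minimal usage sketch of the new option from the Python side. The flow scaffolding follows the usual `cocoindex.flow_def` pattern; the account name, container name, and the 10 MiB threshold are placeholders, not part of this change:

```python
import cocoindex


@cocoindex.flow_def(name="AzureBlobMaxFileSizeDemo")
def demo_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
) -> None:
    # Blobs larger than max_file_size are treated as non-existent and
    # skipped by the source, both during listing and on individual reads.
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.AzureBlob(
            account_name="myaccount",        # placeholder
            container_name="mycontainer",    # placeholder
            included_patterns=["**/*.md"],
            max_file_size=10 * 1024 * 1024,  # 10 MiB
        )
    )
```

Because oversized blobs surface as `NonExistence` rather than as errors, a previously indexed blob that later grows past the limit should simply be dropped from the index on the next update.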