Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/docs/sources/googledrive.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ The spec takes the following fields:
* `recent_changes_poll_interval` (`datetime.timedelta`, optional): when set, this source provides a change capture mechanism by periodically polling Google Drive for recently modified files.
* `included_patterns` (`list[str]`, optional): a list of glob patterns to include files, e.g. `["*.txt", "docs/**/*.md"]`. If not specified, all files will be included.
* `excluded_patterns` (`list[str]`, optional): a list of glob patterns to exclude files, e.g. `["tmp", "**/node_modules"]`. Any file or directory matching these patterns will be excluded even if they match `included_patterns`. If not specified, no files will be excluded.
* `max_file_size` (`int`, optional): the maximum file size in bytes. When set, any source file whose size exceeds this limit will be ignored.

:::info

Expand Down
1 change: 1 addition & 0 deletions python/cocoindex/sources/_engine_builtin_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class GoogleDrive(op.SourceSpec):
# See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns.
excluded_patterns: list[str] | None = None

max_file_size: int | None = None
recent_changes_poll_interval: datetime.timedelta | None = None


Expand Down
23 changes: 21 additions & 2 deletions src/ops/sources/google_drive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ pub struct Spec {
recent_changes_poll_interval: Option<std::time::Duration>,
included_patterns: Option<Vec<String>>,
excluded_patterns: Option<Vec<String>>,
max_file_size: Option<i64>,
}

struct Executor {
Expand All @@ -70,6 +71,7 @@ struct Executor {
root_folder_ids: IndexSet<Arc<str>>,
recent_updates_poll_interval: Option<std::time::Duration>,
pattern_matcher: PatternMatcher,
max_file_size: Option<i64>,
}

impl Executor {
Expand Down Expand Up @@ -97,6 +99,7 @@ impl Executor {
root_folder_ids: spec.root_folder_ids.into_iter().map(Arc::from).collect(),
recent_updates_poll_interval: spec.recent_changes_poll_interval,
pattern_matcher: PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?,
max_file_size: spec.max_file_size,
})
}
}
Expand Down Expand Up @@ -303,7 +306,7 @@ impl SourceExecutor for Executor {
let mut seen_ids = HashSet::new();
let mut folder_ids = self.root_folder_ids.clone();
let fields = format!(
"files(id,name,mimeType,trashed{})",
"files(id,name,mimeType,trashed,size{})",
optional_modified_time(options.include_ordinal)
);
let mut new_folder_ids = Vec::new();
Expand All @@ -319,6 +322,12 @@ impl SourceExecutor for Executor {
if !file.name.as_deref().is_some_and(|name| self.pattern_matcher.is_file_included(name)){
continue
}
if let Some(max_size) = self.max_file_size
&& let Some(file_size) = file.size
&& file_size > max_size {
// Skip files over the specified limit
continue;
}
curr_rows.extend(self.visit_file(file, &mut new_folder_ids, &mut seen_ids)?);
}
if !curr_rows.is_empty() {
Expand All @@ -342,7 +351,7 @@ impl SourceExecutor for Executor {
) -> Result<PartialSourceRowData> {
let file_id = key.single_part()?.str_value()?;
let fields = format!(
"id,name,mimeType,trashed{}",
"id,name,mimeType,trashed,size{}",
optional_modified_time(options.include_ordinal)
);
let resp = self
Expand Down Expand Up @@ -375,6 +384,16 @@ impl SourceExecutor for Executor {
content_version_fp: None,
});
}
if let Some(max_size) = self.max_file_size
&& let Some(file_size) = file.size
&& file_size > max_size
{
return Ok(PartialSourceRowData {
value: Some(SourceValue::NonExistence),
ordinal: Some(Ordinal::unavailable()),
content_version_fp: None,
});
}
let ordinal = if options.include_ordinal {
file.modified_time.map(|t| t.try_into()).transpose()?
} else {
Expand Down
Loading