Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 176 additions & 11 deletions src/tools/mongodb/create/insertMany.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js";
import { type ToolArgs, type OperationType, formatUntrustedData } from "../../tool.js";
import { zEJSON } from "../../args.js";
import { type Document } from "bson";
import { zSupportedEmbeddingParameters } from "../../../common/search/embeddingsProvider.js";
import { ErrorCodes, MongoDBError } from "../../../common/errors.js";
import type { VectorFieldIndexDefinition } from "../../../common/search/vectorSearchEmbeddingsManager.js";

export class InsertManyTool extends MongoDBToolBase {
public name = "insert-many";
Expand All @@ -12,7 +16,12 @@
documents: z
.array(zEJSON().describe("An individual MongoDB document"))
.describe(
"The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()"
"The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany(). For fields that have vector search indexes, you can provide raw text strings that will be automatically converted to embeddings if embeddingParameters is provided."
),
embeddingParameters: zSupportedEmbeddingParameters
.optional()
.describe(
"The embedding model and its parameters to use to generate embeddings for fields that have vector search indexes. When a field has a vector search index and contains a plain text string in the document, embeddings will be automatically generated from that string value. Note to LLM: If unsure which embedding model to use, ask the user before providing one."
),
};
public operationType: OperationType = "create";
Expand All @@ -21,23 +30,34 @@
database,
collection,
documents,
embeddingParameters,
}: ToolArgs<typeof this.argsShape>): Promise<CallToolResult> {
const provider = await this.ensureConnected();

const embeddingValidations = new Set(
...(await Promise.all(
documents.flatMap((document) =>
this.session.vectorSearchEmbeddingsManager.findFieldsWithWrongEmbeddings(
{ database, collection },
document
)
)
))
// Get vector search indexes for the collection
const vectorIndexes = await this.session.vectorSearchEmbeddingsManager.embeddingsForNamespace({
database,
collection,
});

// Process documents to replace raw string values with generated embeddings
documents = await this.replaceRawValuesWithEmbeddingsIfNecessary({
database,
collection,
documents,
vectorIndexes,
embeddingParameters,
});

const embeddingValidationPromises = documents.map((document) =>
this.session.vectorSearchEmbeddingsManager.findFieldsWithWrongEmbeddings({ database, collection }, document)
);
const embeddingValidationResults = await Promise.all(embeddingValidationPromises);
const embeddingValidations = new Set(embeddingValidationResults.flat());

if (embeddingValidations.size > 0) {
// tell the LLM what happened
const embeddingValidationMessages = [...embeddingValidations].map(
const embeddingValidationMessages = Array.from(embeddingValidations).map(
(validation) =>
`- Field ${validation.path} is an embedding with ${validation.expectedNumDimensions} dimensions and ${validation.expectedQuantization}` +
` quantization, and the provided value is not compatible. Actual dimensions: ${validation.actualNumDimensions}, ` +
Expand All @@ -63,4 +83,149 @@
content,
};
}

/**
 * Runs every document through `processDocumentForEmbeddings` and returns the results.
 *
 * When `vectorIndexes` is empty the input array itself is returned untouched.
 * Otherwise a new array is built; documents are handled one after another, in
 * their original order, and falsy entries are skipped.
 *
 * @param database - Database name forwarded to the per-document step.
 * @param collection - Collection name forwarded to the per-document step.
 * @param documents - Documents about to be inserted.
 * @param vectorIndexes - Vector field index definitions for the namespace.
 * @param embeddingParameters - Optional embedding settings forwarded as-is.
 * @returns The processed documents (or `documents` itself when there are no indexes).
 */
private async replaceRawValuesWithEmbeddingsIfNecessary({
    database,
    collection,
    documents,
    vectorIndexes,
    embeddingParameters,
}: {
    database: string;
    collection: string;
    documents: Document[];
    vectorIndexes: VectorFieldIndexDefinition[];
    embeddingParameters?: z.infer<typeof zSupportedEmbeddingParameters>;
}): Promise<Document[]> {
    // Without any vector index there is nothing that could need an embedding.
    if (vectorIndexes.length === 0) {
        return documents;
    }

    const converted: Document[] = [];

    for (const entry of documents) {
        // Guard against holes / falsy slots in the incoming array.
        if (!entry) {
            continue;
        }

        // Awaited inside the loop on purpose: one document at a time.
        const result = await this.processDocumentForEmbeddings(
            database,
            collection,
            entry,
            vectorIndexes,
            embeddingParameters
        );
        converted.push(result);
    }

    return converted;
}

/**
 * Replaces plain-string values on vector-indexed fields of a single document
 * with embeddings generated from those strings.
 *
 * Steps:
 *  1. Collect each index path whose value in `document` is a string.
 *  2. If none are found, return `document` itself.
 *  3. If some are found but `embeddingParameters` is missing, throw.
 *  4. Generate one embedding per collected field (sequentially).
 *  5. Write the embeddings into a shallow copy of the document and return it.
 *
 * @param database - Database name passed to the embeddings manager.
 * @param collection - Collection name passed to the embeddings manager.
 * @param document - The document to inspect.
 * @param vectorIndexes - Vector field index definitions; only `path` is read here.
 * @param embeddingParameters - Embedding settings; required once a raw string is found.
 * @returns `document` unchanged, or a shallow copy with embeddings substituted.
 * @throws MongoDBError with `ErrorCodes.AtlasVectorSearchInvalidQuery` when raw
 *         strings are present on indexed fields but no `embeddingParameters` were given.
 */
private async processDocumentForEmbeddings(
    database: string,
    collection: string,
    document: Document,
    vectorIndexes: VectorFieldIndexDefinition[],
    embeddingParameters?: z.infer<typeof zSupportedEmbeddingParameters>
): Promise<Document> {
    // Phase 1: which indexed paths currently hold raw text?
    const pending: Array<{ path: string; text: string }> = [];
    for (const { path } of vectorIndexes) {
        const candidate = this.getFieldValue(document, path);
        if (typeof candidate === "string") {
            pending.push({ path, text: candidate });
        }
    }

    if (pending.length === 0) {
        return document;
    }

    // Raw text was found, so the caller must have said how to embed it.
    if (!embeddingParameters) {
        const fieldPaths = pending.map(({ path }) => path).join(", ");
        throw new MongoDBError(
            ErrorCodes.AtlasVectorSearchInvalidQuery,
            `Fields [${fieldPaths}] have vector search indexes and contain raw text strings. The embeddingParameters parameter is required to generate embeddings for these fields.`
        );
    }

    // Phase 2: generate all embeddings before touching the document.
    const vectorsByPath = new Map<string, number[]>();
    for (const { path, text } of pending) {
        const generated = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({
            database,
            collection,
            path,
            rawValues: [text],
            embeddingParameters,
            inputType: "document",
        });

        // Only the first result is used; anything that is not an array is ignored,
        // in which case the raw string stays in place for that field.
        if (generated.length > 0) {
            const first = generated[0];
            if (Array.isArray(first)) {
                vectorsByPath.set(path, first as number[]);
            }
        }
    }

    // Phase 3: write the embeddings into a copy.
    // NOTE(review): the spread is a shallow copy, so nested objects are still shared
    // with `document`; writes to dotted paths therefore land in those shared objects.
    const updated = { ...document };
    for (const { path } of pending) {
        const vector = vectorsByPath.get(path);
        if (vector) {
            this.setFieldValue(updated, path, vector);
        }
    }

    return updated;
}

/**
 * Reads the value found at a dot-separated `path` inside `document`.
 *
 * Walks one segment at a time; as soon as the current value is falsy, is not
 * an object, or does not have the next segment (checked with `in`), the lookup
 * stops and yields `undefined`.
 *
 * @param document - Root object to read from.
 * @param path - Dot-separated property path, e.g. `"a.b.c"`.
 * @returns The value at `path`, or `undefined` if the path cannot be followed.
 */
private getFieldValue(document: Document, path: string): unknown {
    let cursor: unknown = document;

    for (const segment of path.split(".")) {
        // Bail out when we cannot descend any further.
        if (!cursor || typeof cursor !== "object" || !(segment in cursor)) {
            return undefined;
        }
        cursor = (cursor as Record<string, unknown>)[segment];
    }

    return cursor;
}

private setFieldValue(document: Document, path: string, value: unknown): void {
const parts = path.split(".");
let current: Record<string, unknown> = document;

for (let i = 0; i < parts.length - 1; i++) {
const part = parts[i];
if (!part) {
continue;
}
if (!(part in current) || typeof current[part] !== "object") {
current[part] = {};
}
current = current[part] as Record<string, unknown>;
}

const lastPart = parts[parts.length - 1];
if (lastPart) {
current[lastPart] = value;

Check warning

Code scanning / CodeQL

Prototype-polluting function Medium

The property chain here is recursively assigned to current without guarding against prototype pollution.

Copilot Autofix

AI 1 day ago

To fix this vulnerability, we need to prevent untrusted property chains from traversing or assigning to dangerous property names such as "__proto__", "constructor", and "prototype". The single best way to accomplish this without affecting existing functionality is to skip any assignment where any element in the parts array matches these restricted names. Modify the loop in setFieldValue so that it continues (i.e., skips) upon encountering such property names in the chain, both when traversing intermediate objects and at the final assignment. This is a minimal, targeted fix. All changes occur in the setFieldValue method of InsertManyTool in src/tools/mongodb/create/insertMany.ts. No external libraries required.


Suggested changeset 1
src/tools/mongodb/create/insertMany.ts

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts
--- a/src/tools/mongodb/create/insertMany.ts
+++ b/src/tools/mongodb/create/insertMany.ts
@@ -211,10 +211,10 @@
     private setFieldValue(document: Document, path: string, value: unknown): void {
         const parts = path.split(".");
         let current: Record<string, unknown> = document;
-
+        const dangerousProperties = ["__proto__", "constructor", "prototype"];
         for (let i = 0; i < parts.length - 1; i++) {
             const part = parts[i];
-            if (!part) {
+            if (!part || dangerousProperties.includes(part)) {
                 continue;
             }
             if (!(part in current) || typeof current[part] !== "object") {
@@ -224,7 +222,7 @@
         }
 
         const lastPart = parts[parts.length - 1];
-        if (lastPart) {
+        if (lastPart && !dangerousProperties.includes(lastPart)) {
             current[lastPart] = value;
         }
     }
EOF
@@ -211,10 +211,10 @@
private setFieldValue(document: Document, path: string, value: unknown): void {
const parts = path.split(".");
let current: Record<string, unknown> = document;

const dangerousProperties = ["__proto__", "constructor", "prototype"];
for (let i = 0; i < parts.length - 1; i++) {
const part = parts[i];
if (!part) {
if (!part || dangerousProperties.includes(part)) {
continue;
}
if (!(part in current) || typeof current[part] !== "object") {
@@ -224,7 +222,7 @@
}

const lastPart = parts[parts.length - 1];
if (lastPart) {
if (lastPart && !dangerousProperties.includes(lastPart)) {
current[lastPart] = value;
}
}
Copilot is powered by AI and may make mistakes. Always verify output.
}
}
}
46 changes: 46 additions & 0 deletions tests/accuracy/insertMany.embeddings.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
import { Matcher } from "./sdk/matcher.js";

// Expected `embeddingParameters` argument of the tool call. The model is pinned to
// "voyage-3"; judging by the matcher names, the other two settings may be left
// out or set to 1024 / "float" respectively (confirm against the Matcher SDK).
const outputDimension = Matcher.anyOf(
    Matcher.undefined,
    Matcher.number((n) => n === 1024)
);
const outputDType = Matcher.anyOf(Matcher.undefined, Matcher.value("float"));

const embeddingParameters = {
    model: "voyage-3",
    outputDimension,
    outputDType,
};

/**
 * Accuracy scenario for `insert-many` with automatic embedding generation:
 * the prompt asks for a document whose `plotEmbeddings` field is derived from
 * the plot summary, and the expected tool call passes the raw summary text in
 * that field together with the embedding parameters.
 */
const plotText = "A computer hacker learns about the true nature of his reality";

describeAccuracyTests(
    [
        {
            prompt: "Insert a document into 'mflix.movies' collection with title 'The Matrix' and a plotSummary field with the text 'A computer hacker learns about the true nature of his reality'. Use the plot summary to generate the 'plotEmbeddings' field using the voyage-3 model.",
            expectedToolCalls: [
                {
                    toolName: "insert-many",
                    parameters: {
                        database: "mflix",
                        collection: "movies",
                        documents: [
                            {
                                title: "The Matrix",
                                plotSummary: plotText,
                                // The raw text is expected here; the tool turns it into an embedding.
                                plotEmbeddings: plotText,
                            },
                        ],
                        embeddingParameters,
                    },
                },
            ],
        },
    ],
    {
        userConfig: { voyageApiKey: "valid-key" },
        clusterConfig: { search: true },
    }
);
Loading