Skip to content

Commit 3bf73da

Browse files
committed
chore: simplify
1 parent 64ac05e commit 3bf73da

File tree

2 files changed

+222
-11
lines changed

2 files changed

+222
-11
lines changed

src/tools/mongodb/create/insertMany.ts

Lines changed: 176 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
33
import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js";
44
import { type ToolArgs, type OperationType, formatUntrustedData } from "../../tool.js";
55
import { zEJSON } from "../../args.js";
6+
import { type Document } from "bson";
7+
import { zSupportedEmbeddingParameters } from "../../../common/search/embeddingsProvider.js";
8+
import { ErrorCodes, MongoDBError } from "../../../common/errors.js";
9+
import type { VectorFieldIndexDefinition } from "../../../common/search/vectorSearchEmbeddingsManager.js";
610

711
export class InsertManyTool extends MongoDBToolBase {
812
public name = "insert-many";
@@ -12,7 +16,12 @@ export class InsertManyTool extends MongoDBToolBase {
1216
documents: z
1317
.array(zEJSON().describe("An individual MongoDB document"))
1418
.describe(
15-
"The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()"
19+
"The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany(). For fields that have vector search indexes, you can provide raw text strings that will be automatically converted to embeddings if embeddingParameters is provided."
20+
),
21+
embeddingParameters: zSupportedEmbeddingParameters
22+
.optional()
23+
.describe(
24+
"The embedding model and its parameters to use to generate embeddings for fields that have vector search indexes. When a field has a vector search index and contains a plain text string in the document, embeddings will be automatically generated from that string value. Note to LLM: If unsure which embedding model to use, ask the user before providing one."
1625
),
1726
};
1827
public operationType: OperationType = "create";
@@ -21,23 +30,34 @@ export class InsertManyTool extends MongoDBToolBase {
2130
database,
2231
collection,
2332
documents,
33+
embeddingParameters,
2434
}: ToolArgs<typeof this.argsShape>): Promise<CallToolResult> {
2535
const provider = await this.ensureConnected();
2636

27-
const embeddingValidations = new Set(
28-
...(await Promise.all(
29-
documents.flatMap((document) =>
30-
this.session.vectorSearchEmbeddingsManager.findFieldsWithWrongEmbeddings(
31-
{ database, collection },
32-
document
33-
)
34-
)
35-
))
37+
// Get vector search indexes for the collection
38+
const vectorIndexes = await this.session.vectorSearchEmbeddingsManager.embeddingsForNamespace({
39+
database,
40+
collection,
41+
});
42+
43+
// Process documents to replace raw string values with generated embeddings
44+
documents = await this.replaceRawValuesWithEmbeddingsIfNecessary({
45+
database,
46+
collection,
47+
documents,
48+
vectorIndexes,
49+
embeddingParameters,
50+
});
51+
52+
const embeddingValidationPromises = documents.map((document) =>
53+
this.session.vectorSearchEmbeddingsManager.findFieldsWithWrongEmbeddings({ database, collection }, document)
3654
);
55+
const embeddingValidationResults = await Promise.all(embeddingValidationPromises);
56+
const embeddingValidations = new Set(embeddingValidationResults.flat());
3757

3858
if (embeddingValidations.size > 0) {
3959
// tell the LLM what happened
40-
const embeddingValidationMessages = [...embeddingValidations].map(
60+
const embeddingValidationMessages = Array.from(embeddingValidations).map(
4161
(validation) =>
4262
`- Field ${validation.path} is an embedding with ${validation.expectedNumDimensions} dimensions and ${validation.expectedQuantization}` +
4363
` quantization, and the provided value is not compatible. Actual dimensions: ${validation.actualNumDimensions}, ` +
@@ -63,4 +83,149 @@ export class InsertManyTool extends MongoDBToolBase {
6383
content,
6484
};
6585
}
86+
87+
/**
 * Builds the list of documents to insert, with raw text in vector-indexed fields
 * swapped for generated embeddings.
 *
 * When `vectorIndexes` is empty the input array itself is returned. Otherwise every
 * document is passed, in order and one at a time, to `processDocumentForEmbeddings`
 * and the results are collected into a new array. Falsy entries are left out of
 * the result.
 *
 * @returns The documents to insert, in their original order.
 */
private async replaceRawValuesWithEmbeddingsIfNecessary({
    database,
    collection,
    documents,
    vectorIndexes,
    embeddingParameters,
}: {
    database: string;
    collection: string;
    documents: Document[];
    vectorIndexes: VectorFieldIndexDefinition[];
    embeddingParameters?: z.infer<typeof zSupportedEmbeddingParameters>;
}): Promise<Document[]> {
    // Nothing is vector-indexed, so there is nothing to convert.
    if (vectorIndexes.length === 0) {
        return documents;
    }

    const converted: Document[] = [];
    for (const entry of documents) {
        if (!entry) {
            continue;
        }
        // Awaited inside the loop: each document is finished before the next one starts.
        const convertedEntry = await this.processDocumentForEmbeddings(
            database,
            collection,
            entry,
            vectorIndexes,
            embeddingParameters
        );
        converted.push(convertedEntry);
    }

    return converted;
}
124+
125+
/**
 * Returns a version of `document` in which every vector-indexed field that holds a
 * raw text string has been replaced by an embedding generated from that string.
 *
 * A field is a candidate when `getFieldValue(document, indexDef.path)` yields a
 * string. If there are no candidates the input document is returned as-is.
 * Embeddings are requested one field at a time with `inputType: "document"`; if a
 * response is empty or its first element is not an array, that field keeps its
 * raw string value.
 *
 * The input document is not modified: the root and every plain-object container
 * on the way to a replaced field are shallow-copied before the write.
 *
 * @throws MongoDBError with `ErrorCodes.AtlasVectorSearchInvalidQuery` when at
 * least one candidate field exists but `embeddingParameters` was not provided.
 */
private async processDocumentForEmbeddings(
    database: string,
    collection: string,
    document: Document,
    vectorIndexes: VectorFieldIndexDefinition[],
    embeddingParameters?: z.infer<typeof zSupportedEmbeddingParameters>
): Promise<Document> {
    // Collect the indexed fields whose current value is raw text.
    const fieldsNeedingEmbeddings: Array<{ path: string; rawValue: string }> = [];
    for (const indexDef of vectorIndexes) {
        const fieldValue = this.getFieldValue(document, indexDef.path);
        if (typeof fieldValue === "string") {
            fieldsNeedingEmbeddings.push({ path: indexDef.path, rawValue: fieldValue });
        }
    }

    if (fieldsNeedingEmbeddings.length === 0) {
        return document;
    }

    if (!embeddingParameters) {
        const fieldPaths = fieldsNeedingEmbeddings.map((f) => f.path).join(", ");
        throw new MongoDBError(
            ErrorCodes.AtlasVectorSearchInvalidQuery,
            `Fields [${fieldPaths}] have vector search indexes and contain raw text strings. The embeddingParameters parameter is required to generate embeddings for these fields.`
        );
    }

    // Generate every embedding before writing anything, so a failed request
    // leaves no partially converted document behind.
    const embeddingsMap = new Map<string, number[]>();
    for (const field of fieldsNeedingEmbeddings) {
        const embeddings = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({
            database,
            collection,
            path: field.path,
            rawValues: [field.rawValue],
            embeddingParameters,
            inputType: "document",
        });

        if (embeddings.length > 0 && Array.isArray(embeddings[0])) {
            embeddingsMap.set(field.path, embeddings[0] as number[]);
        }
    }

    // `{ ...document }` copies only the top level. For a dotted path the nested
    // containers would still be shared with the caller's document, so they are
    // detached first and the write lands on copies.
    const processedDoc: Document = { ...document };
    for (const [path, embedding] of embeddingsMap) {
        this.detachContainersAlongPath(processedDoc, path);
        this.setFieldValue(processedDoc, path, embedding);
    }

    return processedDoc;
}

/**
 * Replaces each plain-object container between `root` and the last segment of
 * `path` with a shallow copy of itself, so a later write at `path` does not
 * reach objects shared with another document. Stops at the first segment whose
 * value is not a plain object (missing, primitive, `null`, array, or an object
 * with a custom prototype); such values are left untouched.
 */
private detachContainersAlongPath(root: Document, path: string): void {
    let container: Record<string, unknown> = root;

    for (const part of path.split(".").slice(0, -1)) {
        if (!part) {
            // Empty segments are skipped, matching how setFieldValue walks a path.
            continue;
        }

        const child = container[part];
        if (child === null || typeof child !== "object" || Array.isArray(child)) {
            return;
        }

        // Spreading a class instance would drop its prototype, so only plain objects are copied.
        const proto: unknown = Object.getPrototypeOf(child);
        if (proto !== Object.prototype && proto !== null) {
            return;
        }

        const copy: Record<string, unknown> = { ...(child as Record<string, unknown>) };
        container[part] = copy;
        container = copy;
    }
}
195+
196+
/**
 * Reads the value found at a dot-separated `path` inside `document`.
 *
 * The path is walked one segment at a time. As soon as the current value is
 * falsy, is not an object, or does not have the segment (checked with `in`),
 * the lookup gives up.
 *
 * @returns The value at `path`, or `undefined` when the path cannot be followed.
 */
private getFieldValue(document: Document, path: string): unknown {
    let node: unknown = document;

    for (const segment of path.split(".")) {
        if (!node || typeof node !== "object" || !(segment in node)) {
            return undefined;
        }
        node = (node as Record<string, unknown>)[segment];
    }

    return node;
}
210+
211+
/**
 * Writes `value` at a dot-separated `path` inside `document`, modifying it in place.
 *
 * Every segment except the last is descended into. An intermediate value that
 * is missing, `null`, or not an object is replaced by a new empty object.
 * Empty segments are skipped, and nothing is written when the last segment is
 * empty.
 */
private setFieldValue(document: Document, path: string, value: unknown): void {
    const parts = path.split(".");
    let current: Record<string, unknown> = document;

    for (const part of parts.slice(0, -1)) {
        if (!part) {
            continue;
        }

        const existing = current[part];
        if (existing !== null && typeof existing === "object") {
            current = existing as Record<string, unknown>;
        } else {
            // `typeof null` is "object", so null is checked explicitly. Descending
            // into it would make the next property access throw a TypeError.
            const created: Record<string, unknown> = {};
            current[part] = created;
            current = created;
        }
    }

    const lastPart = parts[parts.length - 1];
    if (lastPart) {
        current[lastPart] = value;
    }
}
66231
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
2+
import { Matcher } from "./sdk/matcher.js";
3+
4+
// Expected `embeddingParameters` argument of the tool call: the model must be
// "voyage-3", while outputDimension and outputDType may each be left out or,
// when present, must be 1024 and "float" respectively.
const embeddingParameters = {
    model: "voyage-3",
    outputDimension: Matcher.anyOf(
        Matcher.undefined,
        Matcher.number((dimension) => dimension === 1024)
    ),
    outputDType: Matcher.anyOf(Matcher.undefined, Matcher.value("float")),
};
12+
13+
/**
 * Accuracy test for insert-many with automatic embedding generation: the prompt
 * asks for a movie document whose 'plotEmbeddings' field is derived from its plot
 * summary, and the expected tool call passes the raw summary text in that field
 * together with the embedding parameters.
 */
const matrixPlotSummary = "A computer hacker learns about the true nature of his reality";

// The document the model is expected to send; plotEmbeddings carries raw text, not a vector.
const expectedMatrixDocument = {
    title: "The Matrix",
    plotSummary: matrixPlotSummary,
    plotEmbeddings: matrixPlotSummary,
};

describeAccuracyTests(
    [
        {
            prompt: "Insert a document into 'mflix.movies' collection with title 'The Matrix' and a plotSummary field with the text 'A computer hacker learns about the true nature of his reality'. Use the plot summary to generate the 'plotEmbeddings' field using the voyage-3 model.",
            expectedToolCalls: [
                {
                    toolName: "insert-many",
                    parameters: {
                        database: "mflix",
                        collection: "movies",
                        documents: [expectedMatrixDocument],
                        embeddingParameters,
                    },
                },
            ],
        },
    ],
    {
        userConfig: { voyageApiKey: "valid-key" },
        clusterConfig: {
            search: true,
        },
    }
);

0 commit comments

Comments
 (0)