Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 26 additions & 3 deletions src/common/search/embeddingsProvider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,33 @@ export const zVoyageModels = z
.enum(["voyage-3-large", "voyage-3.5", "voyage-3.5-lite", "voyage-code-3"])
.default("voyage-3-large");

// Zod does not understand JS boxed numbers (like Int32) as integer literals,
// so we preprocess them to unwrap them so Zod understands them.
/**
 * Unwraps a boxed numeric object into a primitive number.
 *
 * @param v - Any incoming value.
 * @returns The primitive number obtained from `Number(v.valueOf())` when `v` is a
 * non-array object that coerces to a non-NaN number; otherwise `v` itself, untouched,
 * so the downstream schema can accept or reject it.
 */
function unboxNumber(v: unknown): number {
    // Arrays are objects as well, and `Number([1024])` coerces to 1024 through
    // `Array.prototype.toString`. They are not boxed numbers, so leave them as-is
    // and let schema validation reject them.
    if (v && typeof v === "object" && !Array.isArray(v) && typeof v.valueOf === "function") {
        const n = Number(v.valueOf());
        if (!Number.isNaN(n)) return n;
    }
    // Not unboxable: hand the original value back unchanged.
    return v as number;
}

export const zVoyageEmbeddingParameters = z.object({
outputDimension: z
.union([z.literal(256), z.literal(512), z.literal(1024), z.literal(2048), z.literal(4096)])
.preprocess(
unboxNumber,
z.union([z.literal(256), z.literal(512), z.literal(1024), z.literal(2048), z.literal(4096)])
)
Comment on lines +42 to +45
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it the SDK that is boxing Number into Int32, etc.? Wondering if that is something we should be doing everywhere else as well.

.optional()
.default(1024),
outputDType: z.enum(["float", "int8", "uint8", "binary", "ubinary"]).optional().default("float"),
outputDtype: z.enum(["float", "int8", "uint8", "binary", "ubinary"]).optional().default("float"),
});

// Schema for the options sent to Voyage: the embedding parameters plus a required
// `inputType`. `.strip()` makes parsing drop any key the schema does not declare.
const zVoyageAPIParameters = zVoyageEmbeddingParameters.extend({ inputType: z.enum(["query", "document"]) }).strip();

// Model identifier type inferred from the `zVoyageModels` schema.
type VoyageModels = z.infer<typeof zVoyageModels>;
// Parameter type inferred from `zVoyageEmbeddingParameters`, intersected with `EmbeddingParameters`.
type VoyageEmbeddingParameters = z.infer<typeof zVoyageEmbeddingParameters> & EmbeddingParameters;

Expand Down Expand Up @@ -62,11 +81,15 @@ class VoyageEmbeddingsProvider implements EmbeddingsProvider<VoyageModels, Voyag
content: EmbeddingsInput[],
parameters: VoyageEmbeddingParameters
): Promise<Embeddings[]> {
// This ensures that if we receive any random parameter from the outside (agent or us)
// it's stripped before sending it to Voyage, as Voyage will reject the request on
// a single unknown parameter.
const voyage = zVoyageAPIParameters.parse(parameters);
const model = this.voyage.textEmbeddingModel(modelId);
const { embeddings } = await embedMany({
model,
values: content,
providerOptions: { voyage: parameters },
providerOptions: { voyage },
});

return embeddings;
Expand Down
22 changes: 16 additions & 6 deletions src/tools/mongodb/read/aggregate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ const VectorSearchStage = z.object({
filter: zEJSON()
.optional()
.describe(
"MQL filter that can only use pre-filter fields from the index definition. Note to LLM: If unsure, use the `collection-indexes` tool to learn which fields can be used for pre-filtering."
"MQL filter that can only use filter fields from the index definition. Note to LLM: If unsure, use the `collection-indexes` tool to learn which fields can be used for filtering."
),
embeddingParameters: zSupportedEmbeddingParameters
.optional()
Expand All @@ -59,11 +59,21 @@ const VectorSearchStage = z.object({
});

export const AggregateArgs = {
pipeline: z
.array(z.union([AnyStage, VectorSearchStage]))
.describe(
"An array of aggregation stages to execute. $vectorSearch can only appear as the first stage of the aggregation pipeline or as the first stage of a $unionWith subpipeline. When using $vectorSearch, unless the user explicitly asks for the embeddings, $unset any embedding field to avoid reaching context limits."
),
pipeline: z.array(z.union([AnyStage, VectorSearchStage])).describe(
`An array of aggregation stages to execute.
\`$vectorSearch\` **MUST** be the first stage of the pipeline, or the first stage of a \`$unionWith\` subpipeline.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unless proven otherwise by accuracy tests, I fear this sounds like $vectorSearch always has to be the first stage, regardless of whether the pipeline needs one or not.

Would it make sense to frame it like -
If $vectorSearch is to be used it **MUST** ... rest of the content

### Usage Rules for \`$vectorSearch\`
- **Unset embeddings:**
Unless the user explicitly requests the embeddings, add an \`$unset\` stage **at the end of the pipeline** to remove the embedding field and avoid context limits. **The $unset stage in this situation is mandatory**.
- **Pre-filtering:**
If the user requests additional filtering, include filters in \`$vectorSearch.filter\` only for pre-filter fields in the vector index.
NEVER include fields in $vectorSearch.filter that are not part of the vector index.
- **Post-filtering:**
For all remaining filters, add a $match stage after $vectorSearch.
### Note to LLM
- If unsure which fields are filterable, use the collection-indexes tool to determine valid prefilter fields.
- If no requested filters are valid prefilters, omit the filter key from $vectorSearch.`
),
responseBytesLimit: z.number().optional().default(ONE_MB).describe(`\
The maximum number of bytes to return in the response. This value is capped by the server's configured maxBytesPerQuery and cannot be exceeded. \
Note to LLM: If the entire aggregation result is required, use the "export" tool instead of increasing this limit.\
Expand Down
255 changes: 231 additions & 24 deletions tests/accuracy/aggregate.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,24 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
import { Matcher } from "./sdk/matcher.js";
import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";

/**
 * Builds a matcher for a pipeline stage that unsets `field`.
 *
 * Accepted stage shapes:
 *  - `{ $unset: "<field>" }` or `{ $unset: ["<field>"] }`
 *  - `{ $unset: { "<field>": "" } }`
 *
 * @param field - Name of the field the stage is expected to unset.
 * @returns A matcher that succeeds when any of the shapes above matches.
 */
function doesUnset(field: string): Matcher {
    const stringOrListForm = Matcher.value({ $unset: Matcher.arrayOrSingle(Matcher.value(field)) });
    const documentForm = Matcher.value({ $unset: { [field]: "" } });

    return Matcher.anyOf(stringOrListForm, documentForm);
}

// Expected `embeddingParameters` shared by the $vectorSearch expectations below.
// The optional fields match either when omitted or when set to the listed value.
const embeddingParameters = {
    model: "voyage-3-large",
    outputDimension: Matcher.anyOf(
        Matcher.undefined,
        Matcher.number((n) => n === 1024)
    ),
    // Key spelled `outputDtype` to match the embedding-parameters schema; the previous
    // `outputDType` spelling never lines up with the key a tool call actually sends.
    outputDtype: Matcher.anyOf(Matcher.undefined, Matcher.value("float")),
};

describeAccuracyTests([
{
prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them",
Expand Down Expand Up @@ -48,16 +66,71 @@ describeAccuracyTests([
index: "titles",
path: "title_embeddings",
queryVector: "hammer of justice",
embeddingParameters: {
model: "voyage-3-large",
outputDimension: Matcher.anyOf(
Matcher.undefined,
Matcher.number((n) => n === 1024)
),
embeddingParameters,
filter: Matcher.emptyObjectOrUndefined,
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
},
},
doesUnset("title_embeddings"),
],
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
},
},
],
mockedTools: {
"collection-indexes": (): CallToolResult => {
return {
content: [
{
type: "text",
text: JSON.stringify({
name: "titles",
type: "vectorSearch",
status: "READY",
queryable: true,
latestDefinition: {
type: "vector",
path: "title_embeddings",
numDimensions: 1024,
quantization: "none",
similarity: "euclidean",
},
}),
},
],
};
},
},
},
{
prompt: "Run a vectorSearch query on musicfy.songs on path 'title_embeddings' using the index 'titles' with the model voyage-3-large to find all 'hammer of justice' songs. Keep the embedding field, do not remove it.",
expectedToolCalls: [
{
toolName: "collection-indexes",
parameters: {
database: "musicfy",
collection: "songs",
},
optional: true,
},
{
toolName: "aggregate",
parameters: {
database: "musicfy",
collection: "songs",
pipeline: [
{
$vectorSearch: {
exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
index: "titles",
path: "title_embeddings",
queryVector: "hammer of justice",
embeddingParameters,
filter: Matcher.emptyObjectOrUndefined,
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
},
},
Matcher.not(doesUnset("title_embeddings")),
],
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
},
Expand Down Expand Up @@ -107,21 +180,16 @@ describeAccuracyTests([
pipeline: [
{
$vectorSearch: {
exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(true)),
exact: true,
index: "titles",
path: "title_embeddings",
queryVector: "hammer of justice",
limit: 10,
embeddingParameters: {
model: "voyage-3-large",
outputDimension: Matcher.anyOf(
Matcher.undefined,
Matcher.number((n) => n === 1024)
),
},
embeddingParameters,
filter: Matcher.emptyObjectOrUndefined,
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
},
},
doesUnset("title_embeddings"),
],
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
},
Expand Down Expand Up @@ -153,7 +221,7 @@ describeAccuracyTests([
},
},
{
prompt: "Run an approximate vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fy' movies.",
prompt: "Run an approximate vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies.",
expectedToolCalls: [
{
toolName: "collection-indexes",
Expand All @@ -173,17 +241,13 @@ describeAccuracyTests([
exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
index: "my-index",
path: "plot_embeddings",
queryVector: "sci-fy",
embeddingParameters: {
model: "voyage-3-large",
outputDimension: Matcher.anyOf(
Matcher.undefined,
Matcher.number((n) => n === 1024)
),
},
queryVector: "sci-fi",
embeddingParameters,
filter: Matcher.emptyObjectOrUndefined,
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
},
},
doesUnset("plot_embeddings"),
],
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
},
Expand Down Expand Up @@ -214,4 +278,147 @@ describeAccuracyTests([
},
},
},
{
prompt: "(Pre-filter) Run a vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies. I only want movies with the `released` after 1993 (included) and are published in catalan.",
expectedToolCalls: [
{
toolName: "collection-indexes",
parameters: {
database: "mflix",
collection: "movies",
},
},
{
toolName: "aggregate",
parameters: {
database: "mflix",
collection: "movies",
pipeline: [
{
$vectorSearch: {
exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
index: "my-index",
path: "plot_embeddings",
queryVector: "sci-fi",
numCandidates: Matcher.anyOf(Matcher.number(), Matcher.undefined),
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
embeddingParameters,
filter: {
released: { $gte: 1993 },
language: Matcher.caseInsensitiveString("catalan"),
},
},
},
doesUnset("plot_embeddings"),
],
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
},
},
],
mockedTools: {
"collection-indexes": (): CallToolResult => {
return {
content: [
{
type: "text",
text: JSON.stringify({
name: "my-index",
type: "vectorSearch",
status: "READY",
queryable: true,
latestDefinition: {
fields: [
{
type: "vector",
path: "plot_embeddings",
numDimensions: 1024,
quantization: "none",
similarity: "euclidean",
},
{
type: "filter",
path: "language",
},
{
type: "filter",
path: "released",
},
],
},
}),
},
],
};
},
},
},
{
prompt: "(No-prefilter) Run a vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies. I only want movies with `released` after 1993 (included) and are published in catalan.",
expectedToolCalls: [
{
toolName: "collection-indexes",
parameters: {
database: "mflix",
collection: "movies",
},
},
{
toolName: "aggregate",
parameters: {
database: "mflix",
collection: "movies",
pipeline: [
{
$vectorSearch: {
exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
index: "my-index",
path: "plot_embeddings",
queryVector: "sci-fi",
numCandidates: Matcher.anyOf(Matcher.number(), Matcher.undefined),
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
embeddingParameters,
filter: Matcher.emptyObjectOrUndefined,
},
},
{
$match: {
released: { $gte: 1993 },
language: Matcher.caseInsensitiveString("catalan"),
},
},
doesUnset("plot_embeddings"),
],
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
},
},
],
mockedTools: {
"collection-indexes": (): CallToolResult => {
return {
content: [
{
type: "text",
text: JSON.stringify({
name: "my-index",
type: "vectorSearch",
status: "READY",
queryable: true,
latestDefinition: {
fields: [
{
type: "vector",
path: "plot_embeddings",
numDimensions: 1024,
quantization: "none",
similarity: "euclidean",
},
],
},
}),
},
],
};
},
},
},
]);
Loading
Loading