mongodb-js · lerouxb · Aug 20, 2025 · Aug 20, 2025 · Aug 20, 2025 · Aug 20, 2025
@@ -71,6 +71,8 @@
     "@types/react": "^17.0.5",
     "@types/react-dom": "^17.0.10",
     "@types/sinon-chai": "^3.2.5",
+    "autoevals": "^0.0.130",
+    "braintrust": "^0.2.4",
     "chai": "^4.3.6",
     "depcheck": "^1.4.1",
     "mocha": "^10.2.0",

@@ -0,0 +1,161 @@
+/* eslint-disable no-console */
+import { createOpenAI } from '@ai-sdk/openai';
+import { streamText } from 'ai';
+import { init, Factuality as _Factuality } from 'autoevals';
+import { Eval } from 'braintrust';
+import type { EvalCase, EvalScorer } from 'braintrust';
+import { OpenAI } from 'openai';
+import { evalCases } from './eval-cases';
+import { fuzzyLinkMatch } from './fuzzylinkmatch';
+import { binaryNdcgAtK } from './binaryndcgatk';
+
+// OpenAI-compatible client that sends its requests to the Braintrust proxy,
+// authenticated with the BRAINTRUST_API_KEY environment variable.
+const client = new OpenAI({
+  baseURL: 'https://api.braintrust.dev/v1/proxy',
+  apiKey: process.env.BRAINTRUST_API_KEY,
+});
+
+// Hand the proxy-backed client to autoevals. Presumably its LLM-based scorers
+// (such as Factuality, used further down) then issue requests through it —
+// confirm against the autoevals documentation.
+init({ client });
+
+/**
+ * Flat, single-turn shape in which eval cases are authored. makeEvalCases()
+ * converts these into ConversationEvalCase objects.
+ */
+export type SimpleEvalCase = {
+  // Optional display name; makeEvalCases() falls back to `input` when absent.
+  name?: string;
+  // The prompt sent to the assistant.
+  input: string;
+  // Reference answer text.
+  expected: string;
+  // Reference source URLs; makeEvalCases() substitutes [] when omitted.
+  expectedSources?: string[];
+};
+
+// A single message, reduced to its text.
+type Message = {
+  text: string;
+};
+// Messages sent to the assistant carry text only.
+type InputMessage = Message;
+// Messages coming back from the assistant also carry the URLs of their sources.
+type OutputMessage = Message & { sources: string[] };
+// Reference messages have the same shape as assistant output.
+type ExpectedMessage = OutputMessage;
+
+// Input handed to the eval task.
+type ConversationEvalCaseInput = {
+  messages: InputMessage[];
+};
+
+// Reference value the scorers compare the task output against.
+type ConversationEvalCaseExpected = {
+  messages: OutputMessage[];
+};
+
+// A Braintrust eval case for this suite, with a mandatory name.
+type ConversationEvalCase = EvalCase<
+  ConversationEvalCaseInput,
+  ConversationEvalCaseExpected,
+  unknown
+> & {
+  name: string; // makeEvalCases() uses the prompt when no name was authored
+};
+
+type ConversationTaskOutput = {
+  // Modeled as an array of message objects for future-proofing, even though
+  // makeAssistantCall() currently returns a single message holding the whole
+  // response text and its sources.
+  // NOTE(review): ExpectedMessage is used here and OutputMessage in
+  // ConversationEvalCaseExpected; the two aliases are identical, so the swap
+  // only affects readability.
+  messages: ExpectedMessage[];
+};
+
+// Signature shared by the scorers defined in this file.
+type ConversationEvalScorer = EvalScorer<
+  ConversationEvalCaseInput,
+  ConversationTaskOutput,
+  ConversationEvalCaseExpected
+>;
+
+/** Concatenates the text of every message, one message per line. */
+function allText(messages: Message[]): string {
+  const lines: string[] = [];
+  for (const message of messages) {
+    lines.push(message.text);
+  }
+  return lines.join('\n');
+}
+
+/**
+ * Converts every authored SimpleEvalCase into the single-turn conversation
+ * shape that the eval task and scorers in this file operate on.
+ */
+function makeEvalCases(): ConversationEvalCase[] {
+  const toConversationCase = ({
+    name,
+    input,
+    expected,
+    expectedSources,
+  }: SimpleEvalCase): ConversationEvalCase => ({
+    // Unnamed cases are labelled with their prompt.
+    name: name ?? input,
+    input: { messages: [{ text: input }] },
+    expected: {
+      messages: [{ text: expected, sources: expectedSources || [] }],
+    },
+    metadata: {},
+  });
+
+  return evalCases.map(toConversationCase);
+}
+
+/**
+ * Sends the eval prompt to the assistant endpoint and collects the reply.
+ *
+ * The prompt is the text of all input messages joined by newlines. The
+ * response is streamed; its text deltas are concatenated into one output
+ * message, and the URLs of the sources reported for the response are attached.
+ *
+ * @param input - The conversation whose messages form the prompt.
+ * @returns A single-message output holding the reply text and its source URLs.
+ */
+async function makeAssistantCall(
+  input: ConversationEvalCaseInput
+): Promise<ConversationTaskOutput> {
+  const openai = createOpenAI({
+    baseURL: 'https://knowledge.staging.corp.mongodb.com/api/v1',
+    apiKey: '',
+    headers: {
+      'User-Agent': 'mongodb-compass/x.x.x',
+    },
+  });
+  const prompt = allText(input.messages);
+
+  const result = streamText({
+    model: openai.responses('mongodb-chat-latest'),
+    temperature: 0,
+    prompt,
+  });
+
+  const chunks: string[] = [];
+
+  for await (const chunk of result.toUIMessageStream()) {
+    // Only answer text belongs in the scored output. Other chunk types (for
+    // example reasoning deltas) can also carry a `delta` field, so select on
+    // the chunk type instead of on the mere presence of `delta`.
+    if (chunk.type === 'text-delta' && chunk.delta) {
+      chunks.push(chunk.delta);
+    }
+  }
+  const text = chunks.join('');
+
+  // TODO: something up with this type
+  const resolvedSources = (await result.sources) as { url: string }[];
+
+  const sources = resolvedSources
+    .map((source) => {
+      // Logged on purpose so the raw source objects show up in the eval run
+      // output (the file disables the no-console lint rule for this).
+      console.log(source);
+      return source.url;
+    })
+    // Drop sources that came back without a URL.
+    .filter((url) => !!url);
+
+  return {
+    messages: [{ text, sources }],
+  };
+}
+
+/**
+ * Scorer that delegates to the autoevals Factuality scorer, passing it the
+ * newline-joined text of the input, output and expected messages.
+ */
+const Factuality: ConversationEvalScorer = ({ input, output, expected }) => {
+  const question = allText(input.messages);
+  const answer = allText(output.messages);
+  const reference = allText(expected.messages);
+
+  return _Factuality({
+    input: question,
+    output: answer,
+    expected: reference,
+    model: 'gpt-4.1',
+    temperature: 0,
+  });
+};
+
+/**
+ * Scorer for the sources returned with the first output message: binary NDCG
+ * over the top 5 links, compared against the expected links with
+ * fuzzyLinkMatch.
+ *
+ * The score is null when the case lists no expected links, so such cases are
+ * reported as unscored rather than as a retrieval score of 0.
+ */
+const BinaryNdcgAt5: ConversationEvalScorer = ({ output, expected }) => {
+  const name = 'BinaryNdcgAt5';
+  const k = 5;
+  // Tolerate an empty messages array on either side.
+  const outputLinks = output.messages[0]?.sources ?? [];
+  const expectedLinks = expected.messages[0]?.sources ?? [];
+
+  // An empty array is truthy, so test the length: cases without expected
+  // sources arrive here as [] and must not be scored.
+  if (expectedLinks.length === 0) {
+    return {
+      name,
+      score: null,
+    };
+  }
+
+  return {
+    name,
+    score: binaryNdcgAtK(expectedLinks, outputLinks, fuzzyLinkMatch, k),
+  };
+};
+
+// Register the 'Compass Assistant' evaluation: cases come from makeEvalCases,
+// each case is answered by makeAssistantCall, and every answer is scored by
+// Factuality and BinaryNdcgAt5. `void` discards whatever Eval returns
+// (presumably a promise — confirm against the braintrust typings).
+void Eval<
+  ConversationEvalCaseInput,
+  ConversationTaskOutput,
+  ConversationEvalCaseExpected
+>('Compass Assistant', {
+  data: makeEvalCases,
+  task: makeAssistantCall,
+  scores: [Factuality, BinaryNdcgAt5],
+});
@@ -0,0 +1,93 @@
+import { strict as assert } from 'assert';
+
+// Predicate deciding whether a retrieved item (`actual`) counts as a hit for
+// an expected item; called below as matchFunc(relevantItem, retrievedItem).
+type MatchFunc<T> = (expected: T, actual: T) => boolean;
+
+// Item types accepted by the metric. removeDuplicates() stores them in a Set,
+// so repeated primitive values are recognised as duplicates.
+type Primitive = string | number | boolean | null | undefined;
+
+/** Throws an assertion error unless `k` is an integer greater than zero. */
+const assertKIsValid = (k: number) => {
+  const isPositiveInteger = Number.isInteger(k) && k > 0;
+  return assert(isPositiveInteger, 'k must be a positive integer');
+};
+
+/**
+  Taken from https://github.com/mongodb/chatbot/blob/004a61464c2c25d6b61ad943d1ad9b2fc934eb73/packages/mongodb-rag-core/src/eval/retrievalMetrics/binaryNdcgAtK.ts#L17
+
+  Binary Normalized Discounted Cumulative Gain (NDCG) at rank K: a ranking
+  quality measure that rewards relevant results more the earlier they appear.
+  Relevance is binary, so each retrieved item contributes a gain of 1 (it
+  matches a relevant item) or 0 (it does not, or it repeats an earlier item).
+
+  @param relevantItems - Expected relevant items, each with relevance 1.
+  @param retrievedItems - Retrieved items in ranked order.
+  @param matchFunc - Decides whether a retrieved item matches a relevant one.
+  @param k - Cutoff rank; only the first k retrieved items are considered.
+  @returns Binary NDCG at rank K (0 when there are no relevant items).
+  @throws An assertion error when k is not a positive integer.
+ */
+export function binaryNdcgAtK<T extends Primitive>(
+  relevantItems: T[],
+  retrievedItems: T[],
+  matchFunc: MatchFunc<T>,
+  k: number
+): number {
+  assertKIsValid(k);
+
+  // Look at no more than the first k retrieved items; exact repeats inside
+  // that window are replaced by null so they earn no credit.
+  const topUnique = removeDuplicates(
+    retrievedItems,
+    Math.min(k, retrievedItems.length)
+  );
+  const gains = calculateRelevanceScores(topUnique, relevantItems, matchFunc);
+
+  return ndcg(gains, relevantItems.length, k);
+}
+
+/**
+  Keeps the first `limit` items and replaces every repeat of an earlier item
+  with null, so positions (ranks) are preserved.
+ */
+function removeDuplicates<T extends Primitive>(
+  items: T[],
+  limit: number
+): (T | null)[] {
+  const alreadySeen = new Set<T>();
+  const result: (T | null)[] = [];
+  for (const candidate of items.slice(0, limit)) {
+    const isRepeat = alreadySeen.has(candidate);
+    alreadySeen.add(candidate);
+    result.push(isRepeat ? null : candidate);
+  }
+  return result;
+}
+
+/**
+  Assigns a binary gain to every retrieved position.
+
+  A position scores 1 when its item matches a relevant item that has not been
+  credited yet, and 0 otherwise. Crediting each relevant item at most once
+  matters when `matchFunc` is looser than strict equality: two different
+  retrieved items that match the same relevant item would otherwise both score
+  1 and could push the resulting NDCG above 1.
+
+  Matching is greedy in ranked order: each retrieved item takes the first
+  still-uncredited relevant item it matches.
+
+  @param retrievedItems - Ranked retrieved items; null marks a duplicate.
+  @param relevantItems - Expected relevant items.
+  @param matchFunc - Called as matchFunc(relevantItem, retrievedItem).
+  @returns One gain (0 or 1) per retrieved position.
+ */
+function calculateRelevanceScores<T extends Primitive>(
+  retrievedItems: (T | null)[],
+  relevantItems: T[],
+  matchFunc: MatchFunc<T>
+): number[] {
+  // Indexes into relevantItems that already earned credit.
+  const credited = new Set<number>();
+
+  return retrievedItems.map((item) => {
+    // handle duplicate items
+    if (item === null) {
+      return 0;
+    }
+    const matchIndex = relevantItems.findIndex(
+      (relevantItem, index) =>
+        !credited.has(index) && matchFunc(relevantItem, item)
+    );
+    if (matchIndex === -1) {
+      return 0;
+    }
+    credited.add(matchIndex);
+    return 1;
+  });
+}
+
+/**
+  Normalized Discounted Cumulative Gain (NDCG) at rank k.
+
+  Divides the DCG of the actual ranking by the DCG of an ideal ranking in
+  which the relevant items fill the leading positions of the top k.
+
+  @param realScores - Gains of the actual ranking, in ranked order. Only the
+    first k entries are used, so the actual and ideal rankings are compared
+    over the same cutoff.
+  @param idealNum - Number of relevant items that exist.
+  @param k - Cutoff rank.
+  @returns The NDCG value, or 0 when the ideal DCG is 0 (no relevant items).
+ */
+export function ndcg(realScores: number[], idealNum: number, k: number) {
+  // Apply the cutoff here too: the ideal ranking is built over k positions,
+  // so scoring more than k actual positions could yield a ratio above 1.
+  const actualDcg = dcg(realScores.slice(0, k));
+  const idealDcg = dcg(ideal(idealNum, k));
+  return idealDcg === 0 ? 0 : actualDcg / idealDcg;
+}
+
+/**
+  Discounted cumulative gain: each gain is divided by log2(rank + 1), with
+  ranks starting at 1, and the discounted gains are summed.
+ */
+function dcg(scores: number[]) {
+  let total = 0;
+  scores.forEach((gain, position) => {
+    // position is 0-based, hence the + 2
+    total = total + gain / Math.log2(position + 2);
+  });
+  return total;
+}
+
+/**
+  Gains of the best possible ranking over k positions when n relevant items
+  exist: 1 for positions below n, then 0.
+ */
+function ideal(n: number, k: number) {
+  const gainAt = (_: unknown, position: number) => (position < n ? 1 : 0);
+  return Array.from({ length: k }, gainAt);
+}
@@ -0,0 +1,39 @@
+import type { SimpleEvalCase } from '../assistant.eval';
+
+// Eval case: a conceptual question about aggregation pipelines. The reference
+// answer also covers building pipelines in Compass and includes an example.
+const evalCase: SimpleEvalCase = {
+  input: 'What is an aggregation pipeline?',
+  expected: `The aggregation pipeline in MongoDB is a framework for data processing and transformation. It consists of a sequence of stages, where each stage performs an operation on the input documents and passes the results to the next stage. Common operations include filtering, grouping, projecting, joining, and calculating values. Aggregation pipelines are powerful for data analysis, reporting, and transformation tasks in MongoDB.

+Compass makes it easy to create and run aggregation pipelines under the Aggregations tab. You may generate an aggregation pipeline with natural language, utilize the visual stage editor, or edit aggregations in the text view. 

+Example aggregation pipeline: 
+db.orders.aggregate([
+   // Stage 1: Unwind the array of products
+   { $unwind: { path: "$products" } },

+   // Stage 2: Match products that cost more than $15
+   { $match: { "products.price": { $gt: 15 } } },

+   // Stage 3: Group products by product ID
+   { $group: {
+         _id: "$products.prod_id",
+         product: { $first: "$products.name" },
+         total_value: { $sum: "$products.price" },
+         quantity: { $sum: 1 }
+      }
+   },

+   // Stage 4: Add a product_id field
+   { $set: { product_id: "$_id" } },

+   // Stage 5: Remove the _id field
+   { $unset: ["_id"] }
+])
+`,
+  // Reference source URLs for this answer.
+  expectedSources: [
+    'https://www.mongodb.com/docs/manual/core/aggregation-pipeline/',
+    'https://www.mongodb.com/docs/compass/create-agg-pipeline/',
+  ],
+};
+
+export default evalCase;
@@ -0,0 +1,13 @@
+import type { SimpleEvalCase } from '../assistant.eval';
+
+// Eval case: how to narrow the document set when $search has to be the first
+// pipeline stage.
+const evalCase: SimpleEvalCase = {
+  input: 'How can I filter docs before running a $search query?',
+  expected:
+    'Because the $search stage must be the first stage in an aggregation pipeline, you cannot pre-filter documents with a preceding $match stage. Instead, filtering should be performed within the $search stage using the filter clause of the compound operator. This allows you to apply predicate queries (e.g., on ranges, dates, or specific terms) to narrow down the dataset before the main query clauses (must or should) are executed. Alternatively, you can filter documents by creating a View—a partial index of your collection that pre-queries and filters out unwanted documents. Note that users need createCollection privileges to build views.',
+  // Reference source URLs for this answer.
+  expectedSources: [
+    'https://www.mongodb.com/docs/atlas/atlas-search/compound/#options',
+    'https://www.mongodb.com/docs/atlas/atlas-search/transform-documents-collections/#example--filter-documents',
+  ],
+};
+
+export default evalCase;
@@ -0,0 +1,10 @@
+import type { SimpleEvalCase } from '../assistant.eval';
+import filterDocsBeforeSearch from './filter-docs-before-search';
+import aggregationPipeline from './aggregation-pipeline';
+import modelData from './model-data';
+
+// Every eval case in the suite; assistant.eval maps over this list to build
+// the cases it runs. New case modules need to be added here.
+export const evalCases: SimpleEvalCase[] = [
+  filterDocsBeforeSearch,
+  aggregationPipeline,
+  modelData,
+];
@@ -0,0 +1,18 @@
+import type { SimpleEvalCase } from '../assistant.eval';
+
+// Eval case: a general data-modeling question; the reference answer walks
+// through the schema design process.
+const evalCase: SimpleEvalCase = {
+  input: 'How do I model data with MongoDB?',
+  expected: `Data modeling in MongoDB is highly dependent on how you access your data. To ensure that your data model has a logical structure and achieves optimal performance, plan your schema prior to using your database at a production scale. To determine your data model, use the following schema design process:

+Identify your workload: Identify the operations that your application runs most frequently
+Map relationships: Identify the relationships in your application's data and decide whether to link or embed related data.
+Apply design patterns: Apply schema design patterns to optimize reads and writes.
+Create indexes: Create indexes to support common query patterns.
+`,
+  // Reference source URLs for this answer.
+  expectedSources: [
+    'https://www.mongodb.com/docs/manual/data-modeling/#plan-your-schema',
+    'https://www.mongodb.com/docs/manual/data-modeling/schema-design-process/#designing-your-schema',
+  ],
+};
+
+export default evalCase;