add screenshots to evals

filip-michalsky · filip-michalsky · commit a6c8750c6922 · 2025-08-28T17:45:36.000-04:00
diff --git a/evals/evaluator.ts b/evals/evaluator.ts
@@ -15,6 +15,7 @@ import {
   EvaluateOptions,
   BatchAskOptions,
   EvaluationResult,
+  EvaluateWithScreenshotsOptions,
 } from "@/types/evaluator";
 import { LLMParsedResponse } from "@/lib/inference";
 import { LLMResponse } from "@/lib/llm/LLMClient";
@@ -240,4 +241,113 @@ export class Evaluator {
       }));
     }
   }
+
+  /**
+   * Answers a YES/NO question about a task using every screenshot captured
+   * while the task ran, so evidence from intermediate states is considered.
+   *
+   * The question and all screenshots (in array order) are sent in a single
+   * user message. Errors thrown by the LLM request propagate to the caller;
+   * a missing or malformed response is reported as an "INVALID" evaluation.
+   *
+   * @param options - The question, the screenshots, and an optional system
+   *   prompt override
+   * @returns A promise that resolves to an EvaluationResult
+   * @throws Error when the question is empty or no screenshot is provided
+   */
+  async evaluateWithScreenshots(
+    options: EvaluateWithScreenshotsOptions,
+  ): Promise<EvaluationResult> {
+    const {
+      question,
+      screenshots,
+      systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
+        Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
+        Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
+        Be critical about the question but consider the ENTIRE sequence when making your determination.
+        Today's date is ${new Date().toLocaleDateString()}`,
+    } = options;
+
+    if (!question) {
+      throw new Error("Question cannot be an empty string");
+    }
+
+    if (!screenshots || screenshots.length === 0) {
+      throw new Error("At least one screenshot must be provided");
+    }
+
+    const llmClient = this.stagehand.llmProvider.getClient(
+      this.modelName,
+      this.modelClientOptions,
+    );
+
+    // Label each image with its real format: a bare Playwright
+    // `page.screenshot()` call produces PNG, so a hard-coded `image/jpeg`
+    // would mislabel it. Buffers without the PNG signature keep the jpeg label.
+    const pngSignature = Buffer.from([
+      0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
+    ]);
+    const imageContents = screenshots.map((screenshot) => {
+      const isPng = screenshot
+        .subarray(0, pngSignature.length)
+        .equals(pngSignature);
+      const mimeType = isPng ? "image/png" : "image/jpeg";
+      return {
+        type: "image_url" as const,
+        image_url: {
+          url: `data:${mimeType};base64,${screenshot.toString("base64")}`,
+        },
+      };
+    });
+
+    const response = await llmClient.createChatCompletion<
+      LLMParsedResponse<LLMResponse>
+    >({
+      logger: this.silentLogger,
+      options: {
+        messages: [
+          { role: "system", content: systemPrompt },
+          {
+            role: "user",
+            content: [
+              {
+                type: "text",
+                text: `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
+              },
+              ...imageContents,
+            ],
+          },
+        ],
+        response_model: {
+          name: "EvaluationResult",
+          schema: EvaluationSchema,
+        },
+      },
+    });
+
+    try {
+      // Validate at runtime rather than casting, so a payload that does not
+      // match the schema becomes INVALID instead of an EvaluationResult with
+      // undefined fields.
+      const parsed = EvaluationSchema.safeParse(response.data);
+      if (!parsed.success) {
+        return {
+          evaluation: "INVALID" as const,
+          reasoning: `Failed to get structured response: ${parsed.error.message}`,
+        };
+      }
+      return {
+        evaluation: parsed.data.evaluation,
+        reasoning: parsed.data.reasoning,
+      };
+    } catch (error) {
+      // Reached e.g. when `response` itself is missing and `.data` cannot be read.
+      const errorMessage =
+        error instanceof Error ? error.message : String(error);
+      return {
+        evaluation: "INVALID" as const,
+        reasoning: `Failed to get structured response: ${errorMessage}`,
+      };
+    }
+  }
 }
diff --git a/evals/tasks/agent/webvoyager.ts b/evals/tasks/agent/webvoyager.ts
@@ -1,5 +1,6 @@
 import { EvalFunction } from "@/types/evals";
 import { Evaluator } from "../../evaluator";
+import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
 
 export const webvoyager: EvalFunction = async ({
   stagehand,
@@ -35,20 +36,39 @@ export const webvoyager: EvalFunction = async ({
       instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}`,
     });
 
+    // Start collecting screenshots in parallel
+    const screenshotCollector = new ScreenshotCollector(stagehand.page, {
+      interval: 2000, // Capture every 2 seconds
+      maxScreenshots: 10, // Keep last 10 screenshots
+      captureOnNavigation: true, // Also capture on page navigation
+    });
+
+    screenshotCollector.start();
+
     await agent.execute({
       instruction: params.ques,
       maxSteps: 50,
     });
 
+    // Stop collecting and get all screenshots
+    const screenshots = screenshotCollector.stop();
+
+    logger.log({
+      category: "evaluation",
+      message: `Collected ${screenshots.length} screenshots for evaluation`,
+      level: 1,
+    });
+
     const evaluator = new Evaluator(stagehand);
-    const evalResult = await evaluator.ask({
-      question: `Did the agent successfully complete this task: "${params.ques}"? Look at the current state of the page to verify if the task was completed successfully.`,
-      screenshot: true,
+    const evalResult = await evaluator.evaluateWithScreenshots({
+      question: `Did the agent successfully complete this task: "${params.ques}"? Look at all the screenshots showing the progression of the task to verify if it was completed successfully.`,
+      screenshots: screenshots,
     });
 
     return {
       _success: evalResult.evaluation === "YES",
       reasoning: evalResult.reasoning,
+      screenshotCount: screenshots.length,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
diff --git a/evals/utils/ScreenshotCollector.ts b/evals/utils/ScreenshotCollector.ts
@@ -0,0 +1,173 @@
+import { Page } from "@playwright/test";
+
+/** Options accepted by the ScreenshotCollector constructor. */
+export interface ScreenshotCollectorOptions {
+  /** Milliseconds between periodic captures (default 2000). */
+  interval?: number;
+  /** Number of most recent screenshots to retain (default 10). */
+  maxScreenshots?: number;
+  /** Also capture on the page's "load" and "domcontentloaded" events (default true). */
+  captureOnNavigation?: boolean;
+}
+
+/** Returns `value` when it is a positive finite number, otherwise `fallback`. */
+function positiveOrDefault(
+  value: number | undefined,
+  fallback: number,
+): number {
+  return typeof value === "number" && Number.isFinite(value) && value > 0
+    ? value
+    : fallback;
+}
+
+/**
+ * Keeps a rolling window of screenshots of a page, captured on a timer and,
+ * optionally, on navigation events.
+ *
+ * Captures never overlap: a trigger that fires while another capture is in
+ * progress is skipped. Capture failures are logged and otherwise ignored.
+ */
+export class ScreenshotCollector {
+  private screenshots: Buffer[] = [];
+  private page: Page;
+  private interval: number;
+  private maxScreenshots: number;
+  private captureOnNavigation: boolean;
+  private intervalId?: NodeJS.Timeout;
+  private navigationListeners: Array<() => void> = [];
+  private isCapturing: boolean = false;
+  // Settles once the most recently started capture has finished.
+  private pendingCapture: Promise<void> = Promise.resolve();
+
+  /**
+   * @param page - The page to take screenshots of
+   * @param options - Capture settings; zero, negative, or non-finite numbers
+   *   fall back to the defaults
+   */
+  constructor(page: Page, options: ScreenshotCollectorOptions = {}) {
+    this.page = page;
+    this.interval = positiveOrDefault(options.interval, 2000);
+    this.maxScreenshots = positiveOrDefault(options.maxScreenshots, 10);
+    this.captureOnNavigation = options.captureOnNavigation ?? true;
+  }
+
+  /**
+   * Starts the capture timer, attaches navigation listeners when enabled, and
+   * kicks off an initial capture. Does nothing if the timer is already running.
+   */
+  start(): void {
+    if (this.intervalId) {
+      return;
+    }
+
+    this.intervalId = setInterval(() => {
+      void this.captureScreenshot("interval");
+    }, this.interval);
+
+    if (this.captureOnNavigation) {
+      const loadListener = () => {
+        void this.captureScreenshot("load");
+      };
+      const domContentListener = () => {
+        void this.captureScreenshot("domcontentloaded");
+      };
+
+      this.page.on("load", loadListener);
+      this.page.on("domcontentloaded", domContentListener);
+
+      this.navigationListeners = [
+        () => this.page.off("load", loadListener),
+        () => this.page.off("domcontentloaded", domContentListener),
+      ];
+    }
+
+    void this.captureScreenshot("initial");
+  }
+
+  /**
+   * Stops the timer and navigation listeners and returns a copy of the
+   * screenshots collected so far.
+   *
+   * A final capture is requested but cannot be awaited here, so it is NOT part
+   * of the returned array; use stopAndFlush() when the final state must be
+   * included.
+   */
+  stop(): Buffer[] {
+    this.detachTriggers();
+    void this.captureScreenshot("final");
+    return this.getScreenshots();
+  }
+
+  /**
+   * Awaitable variant of stop(): stops the triggers, waits for any capture
+   * still in progress, takes one last screenshot, and returns the collected
+   * screenshots including it (provided that capture succeeds).
+   */
+  async stopAndFlush(): Promise<Buffer[]> {
+    this.detachTriggers();
+    // Wait first: the overlap guard would otherwise drop the final capture.
+    await this.pendingCapture;
+    await this.captureScreenshot("final");
+    return this.getScreenshots();
+  }
+
+  /** Clears the timer and removes the navigation listeners. */
+  private detachTriggers(): void {
+    if (this.intervalId) {
+      clearInterval(this.intervalId);
+      this.intervalId = undefined;
+    }
+
+    this.navigationListeners.forEach((removeListener) => removeListener());
+    this.navigationListeners = [];
+  }
+
+  /**
+   * Starts a capture unless one is already in progress, and remembers its
+   * promise so stopAndFlush() can wait for it.
+   */
+  private captureScreenshot(trigger: string): Promise<void> {
+    if (this.isCapturing) {
+      return Promise.resolve();
+    }
+
+    this.isCapturing = true;
+    this.pendingCapture = this.runCapture(trigger);
+    return this.pendingCapture;
+  }
+
+  /** Takes one screenshot, trims the window to maxScreenshots, and logs the outcome. */
+  private async runCapture(trigger: string): Promise<void> {
+    try {
+      const screenshot = await this.page.screenshot();
+      this.screenshots.push(screenshot);
+
+      if (this.screenshots.length > this.maxScreenshots) {
+        this.screenshots.shift();
+      }
+
+      console.log(
+        `Screenshot captured (trigger: ${trigger}), total: ${this.screenshots.length}`,
+      );
+    } catch (error) {
+      console.error(`Failed to capture screenshot (${trigger}):`, error);
+    } finally {
+      this.isCapturing = false;
+    }
+  }
+
+  /** Returns a copy of the screenshots currently held, oldest first. */
+  getScreenshots(): Buffer[] {
+    return [...this.screenshots];
+  }
+
+  /** Returns how many screenshots are currently held. */
+  getScreenshotCount(): number {
+    return this.screenshots.length;
+  }
+
+  /** Discards all held screenshots. */
+  clear(): void {
+    this.screenshots = [];
+  }
+}
diff --git a/types/evaluator.ts b/types/evaluator.ts
@@ -38,3 +38,15 @@ export interface EvaluationResult {
    */
   reasoning: string;
 }
+
+/**
+ * Input for screenshot-based evaluation: one question plus the images to judge it against
+ */
+export interface EvaluateWithScreenshotsOptions {
+  /** Question the evaluator should answer about the task state */
+  question: string;
+  /** Screenshots captured while the task was executing, as image buffers */
+  screenshots: Buffer[];
+  /** Optional replacement for the evaluator's system prompt */
+  systemPrompt?: string;
+}