Skip to content

Commit a6c8750

Browse files
add screenshots to evals
1 parent ae514f5 commit a6c8750

File tree

4 files changed

+218
-3
lines changed

4 files changed

+218
-3
lines changed

evals/evaluator.ts

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import {
1515
EvaluateOptions,
1616
BatchAskOptions,
1717
EvaluationResult,
18+
EvaluateWithScreenshotsOptions,
1819
} from "@/types/evaluator";
1920
import { LLMParsedResponse } from "@/lib/inference";
2021
import { LLMResponse } from "@/lib/llm/LLMClient";
@@ -240,4 +241,84 @@ export class Evaluator {
240241
}));
241242
}
242243
}
244+
245+
/**
 * Evaluates a question using multiple screenshots captured during execution.
 * All screenshots are sent to the model in a single request, after the
 * question text, in the order given.
 *
 * Each image is labelled with a MIME type derived from its leading bytes
 * (PNG signature -> `image/png`, anything else -> `image/jpeg`) instead of
 * being unconditionally declared as JPEG.
 *
 * @param options - The options for screenshot-based evaluation
 * @returns A promise that resolves to an EvaluationResult. If the model's
 *   payload does not satisfy `EvaluationSchema`, the result is
 *   `{ evaluation: "INVALID", reasoning: <validation error> }`.
 * @throws Error if `question` is empty or `screenshots` is missing/empty.
 *   Errors raised by the LLM client call itself are not caught here.
 */
async evaluateWithScreenshots(
  options: EvaluateWithScreenshotsOptions,
): Promise<EvaluationResult> {
  const {
    question,
    screenshots,
    systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
Be critical about the question but consider the ENTIRE sequence when making your determination.
Today's date is ${new Date().toLocaleDateString()}`,
  } = options;

  if (!question) {
    throw new Error("Question cannot be an empty string");
  }

  if (!screenshots || screenshots.length === 0) {
    throw new Error("At least one screenshot must be provided");
  }

  // First four bytes of the PNG file signature (0x89 'P' 'N' 'G').
  const pngMagic = [0x89, 0x50, 0x4e, 0x47];
  // Reading past the end of a short buffer yields undefined, so tiny or empty
  // buffers simply fall through to the legacy JPEG label.
  const mimeTypeOf = (image: Buffer): string =>
    pngMagic.every((byte, index) => image[index] === byte)
      ? "image/png"
      : "image/jpeg";

  const llmClient = this.stagehand.llmProvider.getClient(
    this.modelName,
    this.modelClientOptions,
  );

  const imageContents = screenshots.map((screenshot) => ({
    type: "image_url" as const,
    image_url: {
      url: `data:${mimeTypeOf(screenshot)};base64,${screenshot.toString("base64")}`,
    },
  }));

  const response = await llmClient.createChatCompletion<
    LLMParsedResponse<LLMResponse>
  >({
    logger: this.silentLogger,
    options: {
      messages: [
        { role: "system", content: systemPrompt },
        {
          role: "user",
          content: [
            {
              type: "text",
              text: `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
            },
            ...imageContents,
          ],
        },
      ],
      response_model: {
        name: "EvaluationResult",
        schema: EvaluationSchema,
      },
    },
  });

  try {
    // Validate rather than cast: a bare cast cannot fail, which would leave
    // the INVALID fallback below unreachable for malformed payloads.
    const result = EvaluationSchema.parse(response.data);
    return { evaluation: result.evaluation, reasoning: result.reasoning };
  } catch (error) {
    const errorMessage =
      error instanceof Error ? error.message : String(error);
    return {
      evaluation: "INVALID" as const,
      reasoning: `Failed to get structured response: ${errorMessage}`,
    };
  }
}
243324
}

evals/tasks/agent/webvoyager.ts

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { EvalFunction } from "@/types/evals";
22
import { Evaluator } from "../../evaluator";
3+
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
34

45
export const webvoyager: EvalFunction = async ({
56
stagehand,
@@ -35,20 +36,39 @@ export const webvoyager: EvalFunction = async ({
3536
instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}`,
3637
});
3738

39+
// Start collecting screenshots in parallel
40+
const screenshotCollector = new ScreenshotCollector(stagehand.page, {
41+
interval: 2000, // Capture every 2 seconds
42+
maxScreenshots: 10, // Keep last 10 screenshots
43+
captureOnNavigation: true, // Also capture on page navigation
44+
});
45+
46+
screenshotCollector.start();
47+
3848
await agent.execute({
3949
instruction: params.ques,
4050
maxSteps: 50,
4151
});
4252

53+
// Stop collecting and get all screenshots
54+
const screenshots = screenshotCollector.stop();
55+
56+
logger.log({
57+
category: "evaluation",
58+
message: `Collected ${screenshots.length} screenshots for evaluation`,
59+
level: 1,
60+
});
61+
4362
const evaluator = new Evaluator(stagehand);
44-
const evalResult = await evaluator.ask({
45-
question: `Did the agent successfully complete this task: "${params.ques}"? Look at the current state of the page to verify if the task was completed successfully.`,
46-
screenshot: true,
63+
const evalResult = await evaluator.evaluateWithScreenshots({
64+
question: `Did the agent successfully complete this task: "${params.ques}"? Look at all the screenshots showing the progression of the task to verify if it was completed successfully.`,
65+
screenshots: screenshots,
4766
});
4867

4968
return {
5069
_success: evalResult.evaluation === "YES",
5170
reasoning: evalResult.reasoning,
71+
screenshotCount: screenshots.length,
5272
debugUrl,
5373
sessionUrl,
5474
logs: logger.getLogs(),

evals/utils/ScreenshotCollector.ts

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import { Page } from "@playwright/test";
2+
3+
/**
 * Configuration for {@link ScreenshotCollector}.
 */
export interface ScreenshotCollectorOptions {
  /** Milliseconds between periodic captures. Missing or < 1 falls back to 2000. */
  interval?: number;
  /** Maximum screenshots retained (oldest dropped first). Missing or < 1 falls back to 10. */
  maxScreenshots?: number;
  /** Also capture on the page's `load` and `domcontentloaded` events. Defaults to true. */
  captureOnNavigation?: boolean;
}

/**
 * Collects screenshots of a page in the background: one immediately on
 * `start()`, one per timer tick, and optionally one per navigation event.
 * Only the most recent `maxScreenshots` images are kept.
 *
 * At most one capture runs at a time; triggers that arrive while a capture is
 * in flight are skipped. Capture failures are logged and never thrown.
 */
export class ScreenshotCollector {
  private static readonly DEFAULT_INTERVAL_MS = 2000;
  private static readonly DEFAULT_MAX_SCREENSHOTS = 10;

  private screenshots: Buffer[] = [];
  private readonly page: Page;
  private readonly interval: number;
  private readonly maxScreenshots: number;
  private readonly captureOnNavigation: boolean;
  private intervalId?: ReturnType<typeof setInterval>;
  private navigationListeners: Array<() => void> = [];
  /** The capture currently running, if any; doubles as the re-entrancy guard. */
  private inFlight?: Promise<void>;

  /**
   * @param page - Page to screenshot.
   * @param options - Optional tuning; invalid numeric values use the defaults.
   */
  constructor(page: Page, options: ScreenshotCollectorOptions = {}) {
    this.page = page;
    this.interval = ScreenshotCollector.atLeastOne(
      options.interval,
      ScreenshotCollector.DEFAULT_INTERVAL_MS,
    );
    // A cap below 1 would make the trim step discard every screenshot.
    this.maxScreenshots = ScreenshotCollector.atLeastOne(
      options.maxScreenshots,
      ScreenshotCollector.DEFAULT_MAX_SCREENSHOTS,
    );
    this.captureOnNavigation = options.captureOnNavigation ?? true;
  }

  /** Returns `value` when it is a number >= 1 (NaN fails the comparison), else `fallback`. */
  private static atLeastOne(value: number | undefined, fallback: number): number {
    return typeof value === "number" && value >= 1 ? value : fallback;
  }

  /**
   * Begins periodic (and, if enabled, navigation-triggered) capturing and
   * kicks off an initial capture. Calling it while already started is a no-op.
   */
  start(): void {
    if (this.intervalId !== undefined) {
      return;
    }

    this.intervalId = setInterval(() => {
      void this.captureScreenshot("interval");
    }, this.interval);

    if (this.captureOnNavigation) {
      const loadListener = (): void => {
        void this.captureScreenshot("load");
      };
      const domContentListener = (): void => {
        void this.captureScreenshot("domcontentloaded");
      };

      this.page.on("load", loadListener);
      this.page.on("domcontentloaded", domContentListener);

      this.navigationListeners = [
        () => this.page.off("load", loadListener),
        () => this.page.off("domcontentloaded", domContentListener),
      ];
    }

    void this.captureScreenshot("initial");
  }

  /**
   * Stops the timer and navigation listeners and synchronously returns a copy
   * of the screenshots collected so far.
   *
   * A best-effort "final" capture is started but NOT awaited, so it is never
   * part of the returned array (it lands in the internal buffer later, or is
   * skipped if another capture is running). Use {@link stopAndFlush} when the
   * final frame must be included.
   */
  stop(): Buffer[] {
    this.detach();
    void this.captureScreenshot("final");
    return this.getScreenshots();
  }

  /**
   * Stops collecting, waits for any running capture to settle, takes one
   * final screenshot, and resolves with a copy of everything retained.
   */
  async stopAndFlush(): Promise<Buffer[]> {
    this.detach();
    // Drain first so the re-entrancy guard cannot swallow the final capture.
    while (this.inFlight) {
      await this.inFlight;
    }
    await this.captureScreenshot("final");
    return this.getScreenshots();
  }

  /** Clears the timer and unregisters navigation listeners; safe to call repeatedly. */
  private detach(): void {
    if (this.intervalId !== undefined) {
      clearInterval(this.intervalId);
      this.intervalId = undefined;
    }

    for (const removeListener of this.navigationListeners) {
      removeListener();
    }
    this.navigationListeners = [];
  }

  /** Runs one capture unless another is already in flight (then does nothing). */
  private async captureScreenshot(trigger: string): Promise<void> {
    if (this.inFlight) {
      return;
    }

    const capture = this.takeAndStore(trigger);
    this.inFlight = capture;
    try {
      await capture;
    } finally {
      this.inFlight = undefined;
    }
  }

  /** Takes a screenshot, appends it, and trims to the cap. Logs instead of throwing. */
  private async takeAndStore(trigger: string): Promise<void> {
    try {
      const screenshot = await this.page.screenshot();
      this.screenshots.push(screenshot);

      while (this.screenshots.length > this.maxScreenshots) {
        this.screenshots.shift();
      }

      console.log(
        `Screenshot captured (trigger: ${trigger}), total: ${this.screenshots.length}`,
      );
    } catch (error) {
      console.error(`Failed to capture screenshot (${trigger}):`, error);
    }
  }

  /** Returns a shallow copy of the retained screenshots, oldest first. */
  getScreenshots(): Buffer[] {
    return [...this.screenshots];
  }

  /** Number of screenshots currently retained. */
  getScreenshotCount(): number {
    return this.screenshots.length;
  }

  /** Discards all retained screenshots. */
  clear(): void {
    this.screenshots = [];
  }
}

types/evaluator.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,15 @@ export interface EvaluationResult {
3838
*/
3939
reasoning: string;
4040
}
41+
42+
/**
 * Input accepted by the multi-screenshot evaluation entry point.
 */
export type EvaluateWithScreenshotsOptions = {
  /**
   * The question to ask about the task state.
   */
  question: string;

  /**
   * Screenshots captured during task execution, passed as raw image buffers.
   */
  screenshots: Buffer[];

  /**
   * Custom system prompt for the evaluator; when omitted the evaluator
   * supplies its own default.
   */
  systemPrompt?: string;
};

0 commit comments

Comments
 (0)