Skip to content

Commit 8422828

Browse files
authored
Add more agent evals (#961)
Part of STG-653. Why: adds more evals for the agent. What changed: added ~15 new evals. Test plan: tested locally; tested on Browserbase.
1 parent 1ecc5c2 commit 8422828

31 files changed

+901
-77
lines changed

.changeset/rich-colts-march.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
Add more evals for stagehand agent

evals/evals.config.json

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,34 @@
323323
"name": "agent/google_flights",
324324
"categories": ["agent"]
325325
},
326+
{
327+
"name": "agent/github_react_version",
328+
"categories": ["agent"]
329+
},
330+
{
331+
"name": "agent/steam_games",
332+
"categories": ["agent"]
333+
},
334+
{
335+
"name": "agent/ubereats",
336+
"categories": ["agent"]
337+
},
338+
{
339+
"name": "agent/kith",
340+
"categories": ["agent"]
341+
},
342+
{
343+
"name": "agent/apple_tv",
344+
"categories": ["agent"]
345+
},
346+
{
347+
"name": "agent/apple_trade_in",
348+
"categories": ["agent"]
349+
},
350+
{
351+
"name": "agent/arxiv_gpt_report",
352+
"categories": ["agent"]
353+
},
326354
{
327355
"name": "agent/sf_library_card",
328356
"categories": ["agent"]
@@ -331,6 +359,14 @@
331359
"name": "agent/sf_library_card_multiple",
332360
"categories": ["agent"]
333361
},
362+
{
363+
"name": "agent/hugging_face",
364+
"categories": ["agent"]
365+
},
366+
{
367+
"name": "agent/google_maps_3",
368+
"categories": ["agent"]
369+
},
334370
{
335371
"name": "login",
336372
"categories": ["act", "regression"]
@@ -463,5 +499,26 @@
463499
"name": "iframe_scroll",
464500
"categories": ["act"]
465501
}
502+
,
503+
{
504+
"name": "agent/nba_trades",
505+
"categories": ["agent"]
506+
},
507+
{
508+
"name": "agent/hotel_booking",
509+
"categories": ["agent"]
510+
},
511+
{
512+
"name": "agent/github",
513+
"categories": ["agent"]
514+
},
515+
{
516+
"name": "agent/all_recipes",
517+
"categories": ["agent"]
518+
},
519+
{
520+
"name": "agent/google_shopping",
521+
"categories": ["agent"]
522+
}
466523
]
467524
}

evals/initStagehand.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import {
1818
} from "@browserbasehq/stagehand";
1919
import { EvalLogger } from "./logger";
2020
import type { StagehandInitResult } from "@/types/evals";
21+
import type { AgentConfig } from "@/dist";
2122
import { AvailableModel } from "@browserbasehq/stagehand";
2223

2324
/**
@@ -104,12 +105,26 @@ export const initStagehand = async ({
104105
// Set navigation timeout to 60 seconds for evaluations
105106
stagehand.context.setDefaultNavigationTimeout(60_000);
106107

108+
const isCUAModel = (model: string): boolean =>
109+
model.includes("computer-use-preview") || model.startsWith("claude");
110+
111+
let agentConfig: AgentConfig | undefined;
112+
if (isCUAModel(modelName)) {
113+
agentConfig = {
114+
model: modelName,
115+
provider: modelName.startsWith("claude") ? "anthropic" : "openai",
116+
} as AgentConfig;
117+
}
118+
119+
const agent = stagehand.agent(agentConfig);
120+
107121
return {
108122
stagehand,
109123
stagehandConfig: config,
110124
logger,
111125
debugUrl,
112126
sessionUrl,
113127
modelName,
128+
agent,
114129
};
115130
};

evals/taskConfig.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
106106

107107
const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
108108
? process.env.EVAL_AGENT_MODELS.split(",")
109-
: ["computer-use-preview-2025-03-11", "claude-3-7-sonnet-latest"];
109+
: ["computer-use-preview-2025-03-11", "claude-sonnet-4-20250514"];
110110

111111
/**
112112
* getModelList:

evals/tasks/agent/all_recipes.ts

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import { Evaluator } from "@/evals/evaluator";
2+
import { EvalFunction } from "@/types/evals";
3+
4+
/**
 * Agent eval: find a Beef Wellington recipe on Allrecipes.
 *
 * Opens the Allrecipes home page, hands the task to the agent (at most 20
 * steps), then asks the `Evaluator` whether a Beef Wellington recipe was
 * found. The eval passes only when the agent reports success, the evaluator
 * answers "YES", and the browser ended on the expected recipe URL.
 *
 * @returns An object with `_success`, `debugUrl`, `sessionUrl` and the
 *   collected `logs`. A failed check additionally carries the evaluator's
 *   `reasoning` as `message`; a thrown exception is returned under `error`.
 *   The Stagehand session is closed in every case.
 */
export const all_recipes: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  // Page the agent is expected to finish on (compared by exact equality).
  const expectedRecipeUrl =
    "https://www.allrecipes.com/recipe/16899/beef-wellington/";

  try {
    await stagehand.page.goto("https://www.allrecipes.com/");
    const evaluator = new Evaluator(stagehand);

    const agentResult = await agent.execute({
      instruction:
        "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
      maxSteps: 20,
    });

    const verdict = await evaluator.evaluate({
      question: "Did the agent find a recipe for Beef Wellington",
    });

    logger.log(agentResult);

    const passed =
      agentResult.success &&
      verdict.evaluation === "YES" &&
      stagehand.page.url() === expectedRecipeUrl;

    if (passed) {
      return {
        _success: true,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    // Any failed check is reported with the evaluator's explanation.
    return {
      _success: false,
      message: verdict.reasoning,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    return {
      _success: false,
      error,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/agent/apple_trade_in.ts

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// This eval is expected to fail due to issues scrolling within the trade-in dialog.
2+
import { EvalFunction } from "@/types/evals";
3+
import { z } from "zod";
4+
5+
/**
 * Agent eval: look up the Apple trade-in value for an iPhone 13 Pro Max in
 * good condition.
 *
 * Opens the Apple trade-in page, lets the agent work (at most 30 steps), then
 * extracts the quoted trade-in credit from the resulting page. The eval
 * passes only when the agent reports success, the extracted value is exactly
 * 360, and the current URL still contains the trade-in path.
 *
 * @returns An object with `_success`, `debugUrl`, `sessionUrl` and the
 *   collected `logs`. On a failed check `message` is the agent's own message;
 *   on a thrown exception `message` describes the thrown value. The Stagehand
 *   session is closed in every case.
 */
export const apple_trade_in: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://www.apple.com/shop/trade-in");
    const agentResult = await agent.execute({
      instruction:
        "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
      maxSteps: 30,
    });

    // Read the value back from the page rather than trusting the agent's text.
    const { tradeInValue } = await stagehand.page.extract({
      modelName: "google/gemini-2.5-flash",
      instruction:
        "Extract the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website. it will be inside this text : Get x trade-in credit toward a new iPhone', provide just the number",
      schema: z.object({
        tradeInValue: z.number(),
      }),
    });

    // NOTE(review): 360 is a hard-coded expected quote; presumably it has to
    // be updated whenever the site's trade-in pricing changes — confirm.
    const success =
      agentResult.success &&
      tradeInValue === 360 &&
      stagehand.page.url().includes("https://www.apple.com/shop/trade-in");

    if (!success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error, so narrow before
    // reading `.message` (also required under `useUnknownInCatchVariables`).
    const message = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/agent/apple_tv.ts

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import { EvalFunction } from "@/types/evals";
2+
import { z } from "zod";
3+
4+
/**
 * Agent eval: find the Apple TV 4K specifications on apple.com.
 *
 * Opens the Apple home page, lets the agent work (at most 30 steps), then
 * extracts the device's height and width from the resulting page. The eval
 * passes only when the agent reports success, the extracted height is 1.2 and
 * width is 3.66, and the current URL contains the Apple TV 4K specs path.
 *
 * @returns An object with `_success`, `debugUrl`, `sessionUrl` and the
 *   collected `logs`. On a failed check `message` is the agent's own message;
 *   on a thrown exception `message` describes the thrown value. The Stagehand
 *   session is closed in every case.
 */
export const apple_tv: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://www.apple.com/");

    const agentResult = await agent.execute({
      instruction:
        "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
      maxSteps: 30,
    });

    // NOTE(review): the extraction prompt mentions "size and weight" while the
    // schema only captures height and width — confirm which is intended.
    const { height, width } = await stagehand.page.extract({
      modelName: "google/gemini-2.5-flash",
      instruction: "Extract the size and weight of the Apple TV 4K",
      schema: z.object({
        height: z.number().describe("The height of the Apple TV 4K in inches"),
        width: z.number().describe("The width of the Apple TV 4K in inches"),
      }),
    });

    const success =
      agentResult.success &&
      height === 1.2 &&
      width === 3.66 &&
      stagehand.page.url().includes("https://www.apple.com/apple-tv-4k/specs/");

    if (!success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error, so narrow before
    // reading `.message` (also required under `useUnknownInCatchVariables`).
    const message = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/agent/arxiv_gpt_report.ts

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// The agent often fails on this eval.
2+
import { EvalFunction } from "@/types/evals";
3+
import { z } from "zod";
4+
/**
 * Agent eval: find when v3 of the "GPT-4 Technical Report" was submitted to
 * arXiv.
 *
 * Opens the arXiv home page, lets the agent work (at most 20 steps), then
 * extracts the v3 submission date from the resulting page. The eval passes
 * only when the agent reports success and the extracted date string equals
 * "03-27-2023".
 *
 * @returns An object with `_success`, `debugUrl`, `sessionUrl` and the
 *   collected `logs`. On a failed check `message` is the agent's own message;
 *   on a thrown exception `message` describes the thrown value. The Stagehand
 *   session is closed in every case.
 */
export const arxiv_gpt_report: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://arxiv.org/");

    const agentResult = await agent.execute({
      instruction:
        "Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
      maxSteps: 20,
    });

    // Original author note: Mon, 27 Mar 2023 17:46:54 UTC
    const { date } = await stagehand.page.extract({
      modelName: "google/gemini-2.5-flash",
      instruction:
        "Extract the date of the v3 submission history, it should be in the format 'MM-DD-YYYY'",
      schema: z.object({
        date: z.string().describe("The date of the v3 submission history"),
      }),
    });

    console.log(`date: ${date}`);

    // The comparison is an exact string match, so the extractor must honor
    // the requested 'MM-DD-YYYY' format for the eval to pass.
    const success = agentResult.success && date === "03-27-2023";

    if (!success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error, so narrow before
    // reading `.message` (also required under `useUnknownInCatchVariables`).
    const message = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

0 commit comments

Comments
 (0)