Commit a5f615d: Merge branch 'main' into agent-revamp (2 parents: a51f812 + ae514f5)

33 files changed (+1560, -122 lines)

.changeset/chilly-laws-smile.md

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+add webvoyager evals

.changeset/cruel-onions-live.md

Lines changed: 0 additions & 5 deletions
This file was deleted.

.changeset/neat-walls-walk.md

Lines changed: 0 additions & 5 deletions
This file was deleted.

.changeset/slimy-cars-matter.md

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+add support for custom baseUrl within openai provider
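
Both changeset files above follow the standard Changesets convention: YAML frontmatter mapping each affected package to a bump type, followed by the release note. As a sketch, an equivalent entry can be written by hand (the filename `example-entry.md` is hypothetical; the `changeset` CLI normally generates a random name like `chilly-laws-smile.md`):

```shell
# Sketch: a hand-written Changesets entry equivalent to the ones above.
# The filename is hypothetical; the CLI normally picks a random one.
mkdir -p .changeset
cat > .changeset/example-entry.md <<'EOF'
---
"@browserbasehq/stagehand": patch
---

add webvoyager evals
EOF
# The entry has exactly two frontmatter delimiter lines.
grep -c '^---$' .changeset/example-entry.md
```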

.github/ISSUE_TEMPLATE/bug_report.md

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
+---
+name: Bug report
+about: Detailed descriptions help us resolve faster
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Before submitting an issue, please:**
+
+- [ ] Check the [documentation](https://docs.stagehand.dev/) for relevant information
+- [ ] Search existing [issues](https://github.com/browserbase/stagehand/issues) to avoid duplicates
+
+## Environment Information
+
+Please provide the following information to help us reproduce and resolve your issue:
+
+**Stagehand:**
+
+- Language/SDK: [TypeScript, Python, MCP…]
+- Stagehand version: [e.g., 1.0.0]
+
+**AI Provider:**
+
+- Provider: [e.g., OpenAI, Anthropic, Azure OpenAI]
+- Model: [e.g., gpt-4o, claude-3-7-sonnet-latest]
+
+## Issue Description
+
+```
+[Describe the current behavior here]
+
+```
+
+### Steps to Reproduce
+
+1.
+2.
+3.
+
+### Minimal Reproduction Code
+
+```tsx
+// Your minimal reproduction code here
+import { Stagehand } from '@browserbasehq/stagehand';
+
+const stagehand = new Stagehand({
+  // IMPORTANT: include your stagehand config
+});
+
+// Steps that reproduce the issue
+
+```
+
+### Error Messages / Log trace
+
+```
+[Paste error messages/logs here]
+
+```
+
+### Screenshots / Videos
+
+```
+[Attach screenshots or videos here]
+
+```
+
+### Related Issues
+
+Are there any related issues or PRs?
+
+- Related to: #[issue number]
+- Duplicate of: #[issue number]
+- Blocks: #[issue number]
.github/ISSUE_TEMPLATE/feature_request.md

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Are you willing to contribute to implementing this feature or fix?**
+
+- [ ] Yes, I can submit a PR
+- [ ] Yes, but I need guidance
+- [ ] No, I cannot contribute at this time

.github/workflows/ci.yml

Lines changed: 74 additions & 1 deletion

@@ -12,7 +12,7 @@ on:
 
 env:
   EVAL_MODELS: "openai/gpt-4.1,google/gemini-2.0-flash,anthropic/claude-3-5-sonnet-latest"
-  EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract"
+  EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract,agent"
   EVAL_MAX_CONCURRENCY: 25
   EVAL_TRIAL_COUNT: 5
 
@@ -29,6 +29,7 @@ jobs:
       run-act: ${{ steps.check-labels.outputs.run-act }}
       run-observe: ${{ steps.check-labels.outputs.run-observe }}
       run-targeted-extract: ${{ steps.check-labels.outputs.run-targeted-extract }}
+      run-agent: ${{ steps.check-labels.outputs.run-agent }}
    steps:
      - id: check-labels
        run: |
@@ -40,6 +41,7 @@
            echo "run-act=true" >> $GITHUB_OUTPUT
            echo "run-observe=true" >> $GITHUB_OUTPUT
            echo "run-targeted-extract=true" >> $GITHUB_OUTPUT
+            echo "run-agent=true" >> $GITHUB_OUTPUT
            exit 0
          fi
 
@@ -49,6 +51,7 @@
          echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
          echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
          echo "run-targeted-extract=${{ contains(github.event.pull_request.labels.*.name, 'targeted-extract') }}" >> $GITHUB_OUTPUT
+          echo "run-agent=${{ contains(github.event.pull_request.labels.*.name, 'agent') }}" >> $GITHUB_OUTPUT
 
  run-lint:
    runs-on: ubuntu-latest
@@ -562,3 +565,73 @@
            echo "Eval summary not found for targeted_extract category. Failing CI."
            exit 1
          fi
+
+  run-agent-evals:
+    needs: [run-targeted-extract-evals, determine-evals]
+    runs-on: ubuntu-latest
+    timeout-minutes: 90 # Agent evals can be long-running
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
+      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
+      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
+      HEADLESS: true
+      EVAL_ENV: browserbase
+      # Use agent models for agent evals in CI
+      EVAL_AGENT_MODELS: "computer-use-preview-2025-03-11,claude-3-7-sonnet-latest"
+      EVAL_TRIAL_COUNT: 2 # Reduce trials for agent evals
+      EVAL_MAX_CONCURRENCY: 10 # Lower concurrency for agent evals
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Check for 'agent' label
+        id: label-check
+        run: |
+          if [ "${{ needs.determine-evals.outputs.run-agent }}" != "true" ]; then
+            echo "has_label=false" >> $GITHUB_OUTPUT
+            echo "No label for AGENT. Exiting with success."
+          else
+            echo "has_label=true" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Set up Node.js
+        if: needs.determine-evals.outputs.run-agent == 'true'
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Install dependencies
+        if: needs.determine-evals.outputs.run-agent == 'true'
+        run: |
+          rm -rf node_modules
+          npm i -g pnpm
+          pnpm install --no-frozen-lockfile
+
+      - name: Build Stagehand
+        if: needs.determine-evals.outputs.run-agent == 'true'
+        run: pnpm run build
+
+      - name: Run Agent Evals
+        if: needs.determine-evals.outputs.run-agent == 'true'
+        run: pnpm run evals category agent
+
+      - name: Log Agent Evals Performance
+        if: needs.determine-evals.outputs.run-agent == 'true'
+        run: |
+          experimentName=$(jq -r '.experimentName' eval-summary.json)
+          echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
+          if [ -f eval-summary.json ]; then
+            agent_score=$(jq '.categories.agent' eval-summary.json)
+            echo "Agent category score: $agent_score%"
+            # Lower threshold for agent evals since they're complex
+            if (( $(echo "$agent_score < 50" | bc -l) )); then
+              echo "Agent category score is below 50%. Failing CI."
+              exit 1
+            fi
+          else
+            echo "Eval summary not found for agent category. Failing CI."
+            exit 1
+          fi

CHANGELOG.md

Lines changed: 6 additions & 0 deletions

@@ -1,5 +1,11 @@
 # @browserbasehq/stagehand
 
+## 2.4.4
+
+### Patch Changes
+
+- [#1012](https://github.com/browserbase/stagehand/pull/1012) [`9e8c173`](https://github.com/browserbase/stagehand/commit/9e8c17374fdc8fbe7f26e6cf802c36bd14f11039) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix disabling api validation whenever a customLLM client is provided
+
 ## 2.4.3
 
 ### Patch Changes

README.md

Lines changed: 1 addition & 0 deletions

@@ -125,6 +125,7 @@ pnpm playwright install
 pnpm run build
 pnpm run example # run the blank script at ./examples/example.ts
 pnpm run example 2048 # run the 2048 example at ./examples/2048.ts
+pnpm run evals -man # see evaluation suite options
 ```
 
 Stagehand is best when you have an API key for an LLM provider and Browserbase credentials. To add these to your project, run:

evals/CHANGELOG.md

Lines changed: 7 additions & 0 deletions

@@ -1,5 +1,12 @@
 # @browserbasehq/stagehand-evals
 
+## 1.0.8
+
+### Patch Changes
+
+- Updated dependencies [[`9e8c173`](https://github.com/browserbase/stagehand/commit/9e8c17374fdc8fbe7f26e6cf802c36bd14f11039)]:
+  - @browserbasehq/stagehand@2.4.4
+
 ## 1.0.7
 
 ### Patch Changes
