
Commit e94850d

Merge remote-tracking branch 'origin/az-sequence-to-trace' into add_agent_names
2 parents 9687422 + 3684de3 commit e94850d


58 files changed: +207 -521 lines changed

README.md

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ Judgeval is an open-source tool for testing, monitoring, and optimizing AI agent
 **🔍 Tracing**
 * Automatic agent tracing for common agent frameworks and SDKs (LangGraph, OpenAI, Anthropic, etc.)
 * Track input/output, latency, cost, token usage at every step
+* Granular cost tracking per customer/per task
 * Function tracing with `@judgment.observe` decorator

 **🧪 Evals**
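
For context on the `@judgment.observe` bullet above, here is an illustrative Python sketch of function tracing; the `Tracer` import path and constructor arguments are assumptions (not taken from this commit) and may differ in your judgeval version.

```python
# Illustrative sketch only: the Tracer import path and constructor are assumed,
# not confirmed by this commit; only the @judgment.observe decorator name is.
from judgeval.tracer import Tracer  # assumed import path

judgment = Tracer(project_name="my-agent")  # assumed constructor argument

@judgment.observe(span_type="function")  # records inputs, outputs, latency, cost for this call
def lookup_refund_policy(query: str) -> str:
    # ... your retrieval / LLM call would go here ...
    return "Refunds are available within 30 days of purchase."

lookup_refund_policy("Can I get a refund?")
```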

docs/api_reference/judgment_client.mdx

Lines changed: 4 additions & 4 deletions
@@ -52,7 +52,7 @@ example = Example(
 results = client.run_evaluation(
     examples=[example],
     scorers=[FaithfulnessScorer(threshold=0.5)],
-    model="gpt-4o",
+    model="gpt-4.1",
 )
 ```
 ```Typescript Typescript
@@ -70,7 +70,7 @@ async function runEval() {
 const results = await client.evaluate({
   examples: [example],
   scorers: [new FaithfulnessScorer(0.5)],
-  model: "gpt-4o",
+  model: "gpt-4.1",
   projectName: "client-api-ref-proj", // Optional: Provide a project name
   evalName: "client-api-ref-eval" // Optional: Provide an eval name
 });
@@ -83,7 +83,7 @@ runEval();

 The `run_evaluation` (Python) / `evaluate` (Typescript) method accepts the following arguments/options:
 - `examples`: A list/array of [Example](/evaluation/data_examples) objects to evaluate.
-- `model`: The model to use for the evaluation, such as `gpt-4o` or `Qwen/Qwen2.5-72B-Instruct-Turbo`.
+- `model`: The model to use for the evaluation, such as `gpt-4.1` or `Qwen/Qwen2.5-72B-Instruct-Turbo`.
 - `scorers`: A list/array of [Scorer](/evaluation/scorers) objects to use for the evaluation.
 - `log_results` (Python) / `logResults` (Typescript): Whether to log the results of the evaluation to the Judgment platform. Defaults to `true`.
 - `override`: Whether to override an existing evaluation with the same name. Defaults to `false`.
@@ -135,7 +135,7 @@ airline_sequence = Sequence(
 results = client.run_sequence_evaluation(
     sequences=[airline_sequence],
     scorers=[DerailmentScorer(threshold=0.5)],
-    model="gpt-4o",
+    model="gpt-4.1",
     log_results=True,
     override=True,
 )
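
As a quick illustration of the arguments listed in the hunk above, here is a minimal Python sketch of `run_evaluation` with `log_results` and `override` set explicitly; the `JudgmentClient`/`Example` import paths and the `retrieval_context` field are assumed rather than shown in this commit.

```python
# Minimal sketch of run_evaluation using the documented options.
# Import paths for JudgmentClient and Example are assumed, not shown in this commit.
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer

client = JudgmentClient()

example = Example(
    input="What is the carry-on allowance?",
    actual_output="You may bring one carry-on bag up to 7 kg.",
    retrieval_context=["Policy: one carry-on bag, 7 kg maximum."],  # assumed field; faithfulness is graded against this
)

results = client.run_evaluation(
    examples=[example],                            # Example objects to evaluate
    scorers=[FaithfulnessScorer(threshold=0.5)],   # scorers applied to each example
    model="gpt-4.1",                               # judge model name
    log_results=True,                              # log results to the Judgment platform (default true)
    override=True,                                 # replace an existing evaluation with the same name
)
print(results)
```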

docs/evaluation/data_datasets.mdx

Lines changed: 2 additions & 2 deletions
@@ -253,7 +253,7 @@ from judgeval.scorers import FaithfulnessScorer # Added import
 res = client.run_evaluation(
     examples=dataset.examples,
     scorers=[FaithfulnessScorer(threshold=0.9)],
-    model="gpt-4o",
+    model="gpt-4.1",
 )
 ```
 ```Typescript Typescript
@@ -270,7 +270,7 @@ const dataset: Example[] = [
 const results = await client.evaluate({
   examples: dataset,
   scorers: [new FaithfulnessScorer(0.9)],
-  model: "gpt-4o",
+  model: "gpt-4.1",
   projectName: "dataset-eval-ts-proj",
   evalName: "dataset-eval-ts-run"
 });

docs/evaluation/introduction.mdx

Lines changed: 3 additions & 3 deletions
@@ -70,14 +70,14 @@ faithfulness_scorer = FaithfulnessScorer(threshold=0.5)
 results = client.run_evaluation(
     examples=[example],
     scorers=[faithfulness_scorer],
-    model="gpt-4o",
+    model="gpt-4.1",
 )

 # You also run evaluations asynchronously like so:
 results = client.a_run_evaluation(
     examples=[example],
     scorers=[faithfulness_scorer],
-    model="gpt-4o",
+    model="gpt-4.1",
 )
 print(results)
 ```
@@ -102,7 +102,7 @@ const faithfulnessScorer = new FaithfulnessScorer(0.5);
 const results = await client.evaluate({
   examples: [example],
   scorers: [faithfulnessScorer],
-  model: "gpt-4o",
+  model: "gpt-4.1",
   projectName: "my-intro-project",
   evalName: "intro-evaluation-run"
 });

docs/evaluation/judges.mdx

Lines changed: 4 additions & 4 deletions
@@ -15,7 +15,7 @@ Both `judgeval` (Python) and `judgeval-js` (TypeScript) support OpenAI models (l

 In Python, this is handled via LiteLLM integration. In TypeScript, the built-in `DefaultJudge` is used.

-You simply pass the model name (e.g., "gpt-4o") to the `model` parameter in your evaluation call:
+You simply pass the model name (e.g., "gpt-4.1") to the `model` parameter in your evaluation call:

 <CodeGroup>
 ```Python Python
@@ -29,7 +29,7 @@ example1 = Example(input="Q1", actual_output="A1")
 results = client.run_evaluation(
     examples=[example1],
     scorers=[AnswerRelevancyScorer(threshold=0.5)],
-    model="gpt-4o" # Uses LiteLLM
+    model="gpt-4.1" # Uses LiteLLM
 )
 ```
 ```Typescript Typescript
@@ -42,7 +42,7 @@ async function runOpenAIJudge() {
 const results = await client.evaluate({
   examples: [example1],
   scorers: [new AnswerRelevancyScorer(0.5)],
-  model: "gpt-4o", // Uses DefaultJudge internally
+  model: "gpt-4.1", // Uses DefaultJudge internally
   projectName: "openai-judge-ts-proj",
   evalName: "openai-judge-ts-eval"
 });
@@ -205,5 +205,5 @@ useCustomJudge();
 </CodeGroup>

 <Note>
-When providing a custom judge instance (like `VertexAIJudge` in Python or `MyCustomJudge` in TypeScript), pass the instance directly to the `model` parameter (Python) or the `judge` option (TypeScript) in the evaluation call. The built-in judges (`DefaultJudge`, `TogetherJudge`) are used automatically when you pass a model *name* string (like "gpt-4o" or "meta-llama/...") to the `model` option in TypeScript.
+When providing a custom judge instance (like `VertexAIJudge` in Python or `MyCustomJudge` in TypeScript), pass the instance directly to the `model` parameter (Python) or the `judge` option (TypeScript) in the evaluation call. The built-in judges (`DefaultJudge`, `TogetherJudge`) are used automatically when you pass a model *name* string (like "gpt-4.1" or "meta-llama/...") to the `model` option in TypeScript.
 </Note>
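
To make the Note concrete, here is a short Python sketch of the two ways to select a judge: a model name string (routed via LiteLLM) versus a custom judge instance passed to `model`. The custom-judge half is commented out because `VertexAIJudge`'s constructor is not shown in this commit; treat it as hypothetical, and the `AnswerRelevancyScorer` import path as an assumption mirroring the `FaithfulnessScorer` import shown elsewhere in these docs.

```python
# Judge selection sketch; import paths assumed, not confirmed by this commit.
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer

client = JudgmentClient()
example1 = Example(input="Q1", actual_output="A1")

# 1) Built-in judge: pass a model *name* string (handled via LiteLLM in Python).
results = client.run_evaluation(
    examples=[example1],
    scorers=[AnswerRelevancyScorer(threshold=0.5)],
    model="gpt-4.1",
)

# 2) Custom judge: pass the judge *instance* directly to `model`.
#    VertexAIJudge and its constructor arguments are hypothetical here.
# judge = VertexAIJudge(model="gemini-1.5-pro")
# results = client.run_evaluation(
#     examples=[example1],
#     scorers=[AnswerRelevancyScorer(threshold=0.5)],
#     model=judge,
# )
print(results)
```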

docs/evaluation/scorers/agent/derailment.mdx

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ airline_sequence = Sequence(
 results = client.run_sequence_evaluation(
     sequences=[airline_sequence],
     scorers=[DerailmentScorer(threshold=0.5)],
-    model="gpt-4o",
+    model="gpt-4.1",
     log_results=True,
     override=True,
 )

docs/evaluation/scorers/classifier_scorer.mdx

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ They can also be run in conjunction with other scorers in a single evaluation ru
 results = client.run_evaluation(
     examples=[example1],
     scorers=[friendliness_scorer],
-    model="gpt-4o"
+    model="gpt-4.1"
 )
 ```

docs/evaluation/scorers/custom_scorers.mdx

Lines changed: 2 additions & 2 deletions
@@ -318,7 +318,7 @@ sample_scorer = SampleScorer()
 results = client.run_evaluation(
     examples=[example1],
     scorers=[sample_scorer],
-    model="gpt-4o"
+    model="gpt-4.1"
 )
 ```
 ## Custom Scorers with Custom Examples
@@ -350,7 +350,7 @@ scorer = CustomScorer(threshold=0.5) # Your custom scorer
 results = client.run_evaluation(
     examples=[custom_example],
     scorers=[scorer],
-    model="gpt-4o-mini",
+    model="gpt-4.1-mini",
 )
 ```

docs/evaluation/scorers/default/answer_correctness.mdx

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ scorer = AnswerCorrectnessScorer(threshold=0.8)
 results = client.run_evaluation(
     examples=[example],
     scorers=[scorer],
-    model="gpt-4o",
+    model="gpt-4.1",
 )
 print(results)
 ```
@@ -69,7 +69,7 @@ async function runAnswerCorrectness() {
 const results = await client.evaluate({
   examples: [example],
   scorers: [scorer],
-  model: "gpt-4o",
+  model: "gpt-4.1",
   projectName: "ans-correct-ts-proj",
   evalName: "ans-correct-ts-eval"
 });

docs/evaluation/scorers/default/answer_relevancy.mdx

Lines changed: 2 additions & 2 deletions
@@ -47,7 +47,7 @@ scorer = AnswerRelevancyScorer(threshold=0.8)
 results = client.run_evaluation(
     examples=[example],
     scorers=[scorer],
-    model="gpt-4o",
+    model="gpt-4.1",
 )
 print(results)
 ```
@@ -68,7 +68,7 @@ async function runAnswerRelevancy() {
 const results = await client.evaluate({
   examples: [example],
   scorers: [scorer],
-  model: "gpt-4o",
+  model: "gpt-4.1",
   projectName: "ans-relevancy-ts-proj",
   evalName: "ans-relevancy-ts-eval"
 });
