Retrieve all user datasets #65


Merged: 13 commits, Feb 14, 2025

1,090 changes: 192 additions & 898 deletions Pipfile.lock

Large diffs are not rendered by default.

43 changes: 35 additions & 8 deletions src/e2etests/judgment_client_test.py
@@ -22,7 +22,8 @@
 )
 from judgeval.judges import TogetherJudge, JudgevalJudge
 from playground import CustomFaithfulnessMetric
-from judgeval.data.datasets.dataset import EvalDataset
+from judgeval.data.datasets.dataset import EvalDataset, GroundTruthExample
+from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
 from judgeval.scorers.prompt_scorer import ClassifierScorer

 # Configure logging
@@ -62,6 +63,30 @@ def test_dataset(self, client: JudgmentClient):
         dataset = client.pull_dataset(alias="test_dataset_5")
         assert dataset, "Failed to pull dataset"

+    def test_pull_all_user_dataset_stats(self, client: JudgmentClient):
+        dataset: EvalDataset = client.create_dataset()
+        dataset.add_example(Example(input="input 1", actual_output="output 1"))
+        dataset.add_example(Example(input="input 2", actual_output="output 2"))
+        dataset.add_example(Example(input="input 3", actual_output="output 3"))
+        random_name1 = ''.join(random.choices(string.ascii_letters + string.digits, k=20))
+        client.push_dataset(alias=random_name1, dataset=dataset, overwrite=False)
+
+        dataset: EvalDataset = client.create_dataset()
+        dataset.add_example(Example(input="input 1", actual_output="output 1"))
+        dataset.add_example(Example(input="input 2", actual_output="output 2"))
+        dataset.add_ground_truth(GroundTruthExample(input="input 1", actual_output="output 1"))
+        dataset.add_ground_truth(GroundTruthExample(input="input 2", actual_output="output 2"))
+        random_name2 = ''.join(random.choices(string.ascii_letters + string.digits, k=20))
+        client.push_dataset(alias=random_name2, dataset=dataset, overwrite=False)
+
+        all_datasets_stats = client.pull_all_user_dataset_stats()
+        print(all_datasets_stats)
+        assert all_datasets_stats, "Failed to pull dataset"
+        assert all_datasets_stats[random_name1]["example_count"] == 3, f"{random_name1} should have 3 examples"
+        assert all_datasets_stats[random_name1]["ground_truth_count"] == 0, f"{random_name1} should have 0 ground truths"
+        assert all_datasets_stats[random_name2]["example_count"] == 2, f"{random_name2} should have 2 examples"
+        assert all_datasets_stats[random_name2]["ground_truth_count"] == 2, f"{random_name2} should have 2 ground truths"
+
     def test_run_eval(self, client: JudgmentClient):
         """Test basic evaluation workflow."""
         # Single step in our workflow, an outreach Sales Agent
@@ -405,6 +430,7 @@ def run_selected_tests(client, test_names: list[str]):

     test_map = {
         'dataset': test_basic_operations.test_dataset,
+        'pull_all_user_dataset_stats': test_basic_operations.test_pull_all_user_dataset_stats,
         'run_eval': test_basic_operations.test_run_eval,
         'assert_test': test_basic_operations.test_assert_test,
         'json_scorer': test_advanced_features.test_json_scorer,
@@ -433,11 +459,12 @@ def run_selected_tests(client, test_names: list[str]):

 run_selected_tests(client, [
     'dataset',
-    'run_eval',
-    'assert_test',
-    'json_scorer',
-    'override_eval',
-    'evaluate_dataset',
-    'classifier_scorer',
-    'custom_judge_vertexai'
+    'pull_all_user_dataset_stats',
+    # 'run_eval',
+    # 'assert_test',
+    # 'json_scorer',
+    # 'override_eval',
+    # 'evaluate_dataset',
+    # 'classifier_scorer',
+    # 'custom_judge_vertexai'
 ])
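
Note on the return shape: this diff never shows the client implementation of pull_all_user_dataset_stats(), so the structure below is inferred purely from the test's subscripting above; the aliases are hypothetical placeholders.

# Sketch of the stats mapping implied by the assertions in the new test;
# not taken from the client source.
all_datasets_stats = {
    "aB3xK9pQ2rS5tU8vW1yZ": {"example_count": 3, "ground_truth_count": 0},
    "Qw7Lm2Nc4Vb6Xz8Jk0Hg": {"example_count": 2, "ground_truth_count": 2},
}

for alias, stats in all_datasets_stats.items():
    print(f"{alias}: {stats['example_count']} examples, "
          f"{stats['ground_truth_count']} ground truths")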
1 change: 1 addition & 0 deletions src/judgeval/constants.py
@@ -36,6 +36,7 @@ def _missing_(cls, value):
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
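
A hedged sketch of how this new constant is presumably consumed on the client side, mirroring the POST-with-API-key pattern of the existing push/pull code (removed from dataset.py below); the function name and payload key are assumptions, not confirmed by this diff:

import requests

from judgeval.constants import JUDGMENT_DATASETS_PULL_ALL_API_URL


def pull_all_user_dataset_stats(judgment_api_key: str) -> dict:
    # Sketch only: the payload key mirrors the request bodies used by the
    # existing push/pull endpoints; the real client method may differ.
    response = requests.post(
        JUDGMENT_DATASETS_PULL_ALL_API_URL,
        json={"judgment_api_key": judgment_api_key},
    )
    response.raise_for_status()
    # Expected: {alias: {"example_count": int, "ground_truth_count": int}}
    return response.json()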
3 changes: 2 additions & 1 deletion src/judgeval/data/datasets/__init__.py
@@ -1,4 +1,5 @@
 from judgeval.data.datasets.dataset import EvalDataset
 from judgeval.data.datasets.ground_truth import GroundTruthExample
+from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

-__all__ = ["EvalDataset", "GroundTruthExample"]
+__all__ = ["EvalDataset", "EvalDatasetClient", "GroundTruthExample"]
123 changes: 1 addition & 122 deletions src/judgeval/data/datasets/dataset.py
@@ -2,16 +2,11 @@
 import csv
 import datetime
 import json
-from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TextColumn
-import requests
 from dataclasses import dataclass, field
 import os
 from typing import List, Optional, Union, Literal

-from judgeval.constants import JUDGMENT_DATASETS_PUSH_API_URL, JUDGMENT_DATASETS_PULL_API_URL
 from judgeval.data.datasets.ground_truth import GroundTruthExample
-from judgeval.data.datasets.utils import ground_truths_to_examples, examples_to_ground_truths
 from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info

@@ -37,120 +32,6 @@ def __init__(self,
         self._id = None
         self.judgment_api_key = judgment_api_key

-    def push(self, alias: str, overwrite: Optional[bool] = False) -> bool:
-        debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
-        if overwrite:
-            warning(f"Overwrite enabled for alias '{alias}'")
-        """
-        Pushes the dataset to Judgment platform
-
-        Mock request:
-        {
-            "alias": alias,
-            "ground_truths": [...],
-            "examples": [...],
-            "overwrite": overwrite
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..."  # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "alias": alias,
-                "ground_truths": [g.to_dict() for g in self.ground_truths],
-                "examples": [e.to_dict() for e in self.examples],
-                "overwrite": overwrite,
-                "judgment_api_key": self.judgment_api_key
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PUSH_API_URL,
-                    json=content
-                )
-                if response.status_code == 500:
-                    error(f"Server error during push: {content.get('message')}")
-                    return False
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during push: {err.response.json()}")
-                else:
-                    error(f"HTTP error during push: {err}")
-
-            info(f"Successfully pushed dataset with alias '{alias}'")
-            payload = response.json()
-            self._alias = payload.get("_alias")
-            self._id = payload.get("_id")
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True
-
-    def pull(self, alias: str):
-        debug(f"Pulling dataset with alias '{alias}'")
-        """
-        Pulls the dataset from Judgment platform
-
-        Mock request:
-        {
-            "alias": alias,
-            "user_id": user_id
-        }
-        ==>
-        {
-            "ground_truths": [...],
-            "examples": [...],
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        # Make a POST request to the Judgment API to get the dataset
-
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
-                total=100,
-            )
-            request_body = {
-                "alias": alias,
-                "judgment_api_key": self.judgment_api_key
-            }
-
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PULL_API_URL,
-                    json=request_body
-                )
-                response.raise_for_status()
-            except requests.exceptions.RequestException as e:
-                error(f"Error pulling dataset: {str(e)}")
-                raise
-
-            info(f"Successfully pulled dataset with alias '{alias}'")
-            payload = response.json()
-            self.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
-            self.examples = [Example(**e) for e in payload.get("examples", [])]
-            self._alias = payload.get("_alias")
-            self._id = payload.get("_id")
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )

     def add_from_json(self, file_path: str) -> None:
         debug(f"Loading dataset from JSON file: {file_path}")
@@ -402,6 +283,4 @@ def __str__(self):
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"
-        )
-
-
+        )
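
The push/pull bodies removed above evidently migrate into the new EvalDatasetClient module that the test file now imports; the diff for eval_dataset_client.py is not shown in this view. The following is only a rough sketch of that split, reusing the request body verbatim from the removed push(); the class shape and signatures are guesses.

import requests

from judgeval.constants import JUDGMENT_DATASETS_PUSH_API_URL


class EvalDatasetClient:
    """Guessed shape; only the test file's import confirms this class exists."""

    def __init__(self, judgment_api_key: str):
        self.judgment_api_key = judgment_api_key

    def push_dataset(self, alias: str, dataset, overwrite: bool = False) -> bool:
        # Same request body as the removed EvalDataset.push(), with the
        # API key now owned by the client rather than the dataset.
        content = {
            "alias": alias,
            "ground_truths": [g.to_dict() for g in dataset.ground_truths],
            "examples": [e.to_dict() for e in dataset.examples],
            "overwrite": overwrite,
            "judgment_api_key": self.judgment_api_key,
        }
        response = requests.post(JUDGMENT_DATASETS_PUSH_API_URL, json=content)
        response.raise_for_status()
        return True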