From 70d649fc908694c468ff9ed50c30d18a5812e0c1 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Tue, 14 Jan 2025 17:37:44 -0800 Subject: [PATCH 1/4] delete telemetry --- judgeval/common/telemetry.py | 123 ----------------------------------- judgeval/playground.py | 31 +++++---- judgeval/run_evaluation.py | 1 - judgeval/scorers/score.py | 61 +++++++++-------- 4 files changed, 45 insertions(+), 171 deletions(-) delete mode 100644 judgeval/common/telemetry.py diff --git a/judgeval/common/telemetry.py b/judgeval/common/telemetry.py deleted file mode 100644 index 22fd05db..00000000 --- a/judgeval/common/telemetry.py +++ /dev/null @@ -1,123 +0,0 @@ -from contextlib import contextmanager -import logging -import os -import socket -import sys -import uuid -import sentry_sdk -from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( - OTLPSpanExporter, -) - - -def get_unique_id(): - unique_id = os.getenv("judgeval_UNIQUE_ID") - if unique_id is None: - unique_id = str(uuid.uuid4()) - os.environ["judgeval_UNIQUE_ID"] = unique_id - return unique_id - - -def telemetry_opt_out(): - return os.getenv("judgeval_TELEMETRY_OPT_OUT") == "YES" - - -def blocked_by_firewall(): - try: - socket.create_connection(("www.google.com", 80)) - return False - except OSError: - return True - - -if not telemetry_opt_out(): - sentry_sdk.init( - dsn="https://5ef587d58109ee45d6544f3657efdd1f@o4506098477236224.ingest.sentry.io/4506098479136768", - profiles_sample_rate=1.0, - traces_sample_rate=1.0, # For performance monitoring - send_default_pii=False, # Don't send personally identifiable information - attach_stacktrace=False, # Don't attach stack traces to messages - default_integrations=False, # Disable Sentry's default integrations - ) - - # Set up the Tracer Provider - trace.set_tracer_provider(TracerProvider()) - tracer_provider = trace.get_tracer_provider() - - # New Relic License Key and OTLP Endpoint - NEW_RELIC_LICENSE_KEY = "1711c684db8a30361a7edb0d0398772cFFFFNRAL" - NEW_RELIC_OTLP_ENDPOINT = "https://otlp.nr-data.net:4317" - otlp_exporter = OTLPSpanExporter( - endpoint=NEW_RELIC_OTLP_ENDPOINT, - headers={"api-key": NEW_RELIC_LICENSE_KEY}, - ) - - # Add the OTLP exporter to the span processor - span_processor = BatchSpanProcessor(otlp_exporter) - tracer_provider.add_span_processor(span_processor) - - logging.getLogger("opentelemetry.exporter.otlp").setLevel(logging.CRITICAL) - - # Create a tracer for your application - tracer = trace.get_tracer(__name__) - - -if ( - os.getenv("ERROR_REPORTING") == "YES" - and not blocked_by_firewall() - and not os.getenv("TELEMETRY_OPT_OUT") -): - - def handle_exception(exc_type, exc_value, exc_traceback): - print({"exc_type": exc_type, "exc_value": exc_value}) - sentry_sdk.capture_exception(exc_value) - sys.__excepthook__(exc_type, exc_value, exc_traceback) - - sys.excepthook = handle_exception - - -@contextmanager -def capture_evaluation_run(type: str): - if not telemetry_opt_out(): - with tracer.start_as_current_span(f"Evaluation run: {type}") as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield - - -@contextmanager -def capture_metric_type(metric_name: str, _track: bool = True): - if not telemetry_opt_out() and _track: - with tracer.start_as_current_span(metric_name) as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield - - -@contextmanager -def 
capture_synthesizer_run(max_generations: int = None, method: str = None): - if not telemetry_opt_out() and max_generations is not None: - with tracer.start_as_current_span( - f"Invoked synthesizer ({max_generations}) | Method: {method}" - ) as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield - - -@contextmanager -def capture_red_teamer_run(task: str): - if not telemetry_opt_out(): - with tracer.start_as_current_span( - f"Invoked red teamer: ({task})" - ) as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield diff --git a/judgeval/playground.py b/judgeval/playground.py index c5d065c6..36eab39e 100644 --- a/judgeval/playground.py +++ b/judgeval/playground.py @@ -15,7 +15,6 @@ from judgeval.judges.utils import create_judge from judgeval.scorers.custom_scorer import CustomScorer from judgeval.scorers.score import * -from judgeval.common.telemetry import capture_metric_type """ Testing implementation of CustomFaithfulness @@ -195,22 +194,22 @@ def metric_progress_indicator( total: int = 9999, transient: bool = True, ): - with capture_metric_type(metric.__name__): - console = Console(file=sys.stderr) # Direct output to standard error - if _show_indicator: - with Progress( - SpinnerColumn(style="rgb(106,0,255)"), - TextColumn("[progress.description]{task.description}"), - console=console, # Use the custom console - transient=transient, - ) as progress: - progress.add_task( - description=scorer_console_msg(metric, async_mode), - total=total, - ) - yield - else: + + console = Console(file=sys.stderr) # Direct output to standard error + if _show_indicator: + with Progress( + SpinnerColumn(style="rgb(106,0,255)"), + TextColumn("[progress.description]{task.description}"), + console=console, # Use the custom console + transient=transient, + ) as progress: + progress.add_task( + description=scorer_console_msg(metric, async_mode), + total=total, + ) yield + else: + yield def prettify_list(lst: List[Any]): diff --git a/judgeval/run_evaluation.py b/judgeval/run_evaluation.py index c32676cf..7a18b373 100644 --- a/judgeval/run_evaluation.py +++ b/judgeval/run_evaluation.py @@ -265,7 +265,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor else: custom_scorers.append(scorer) debug(f"Added custom scorer: {type(scorer).__name__}") - debug(f"Found {len(judgment_scorers)} judgment scorers and {len(custom_scorers)} custom scorers") api_results: List[ScoringResult] = [] diff --git a/judgeval/scorers/score.py b/judgeval/scorers/score.py index b16352e8..c3a790d0 100644 --- a/judgeval/scorers/score.py +++ b/judgeval/scorers/score.py @@ -18,7 +18,6 @@ ) from judgeval.scorers import CustomScorer from judgeval.scorers.utils import clone_scorers, scorer_console_msg -from judgeval.common.telemetry import capture_evaluation_run from judgeval.common.exceptions import MissingTestCaseParamsError from judgeval.common.logger import example_logging_context, debug, error, warning, info from judgeval.judges import judgevalJudge @@ -312,36 +311,10 @@ async def execute_with_semaphore(func: Callable, *args, **kwargs): debug(f"Scorer threshold: {scorer.threshold}") if hasattr(scorer, 'model'): debug(f"Scorer model: {type(scorer.model).__name__}") - with capture_evaluation_run("Example"): - if isinstance(ex, Example): - if len(scorers) == 0: - pbar.update(1) - continue - - cloned_scorers: List[CustomScorer] = clone_scorers( - scorers - ) - task = execute_with_semaphore( - func=a_eval_examples_helper, - 
scorers=cloned_scorers, - example=ex, - scoring_results=scoring_results, - score_index=i, - ignore_errors=ignore_errors, - skip_on_missing_params=skip_on_missing_params, - show_indicator=show_indicator, - _use_bar_indicator=_use_bar_indicator, - pbar=pbar, - ) - tasks.append(asyncio.create_task(task)) - - await asyncio.sleep(throttle_value) - await asyncio.gather(*tasks) - else: - for i, ex in enumerate(examples): - with capture_evaluation_run("Example"): + if isinstance(ex, Example): if len(scorers) == 0: + pbar.update(1) continue cloned_scorers: List[CustomScorer] = clone_scorers( @@ -355,12 +328,38 @@ async def execute_with_semaphore(func: Callable, *args, **kwargs): score_index=i, ignore_errors=ignore_errors, skip_on_missing_params=skip_on_missing_params, - _use_bar_indicator=_use_bar_indicator, show_indicator=show_indicator, + _use_bar_indicator=_use_bar_indicator, + pbar=pbar, ) - tasks.append(asyncio.create_task((task))) + tasks.append(asyncio.create_task(task)) await asyncio.sleep(throttle_value) + await asyncio.gather(*tasks) + else: + for i, ex in enumerate(examples): + + if isinstance(ex, Example): + if len(scorers) == 0: + continue + + cloned_scorers: List[CustomScorer] = clone_scorers( + scorers + ) + task = execute_with_semaphore( + func=a_eval_examples_helper, + scorers=cloned_scorers, + example=ex, + scoring_results=scoring_results, + score_index=i, + ignore_errors=ignore_errors, + skip_on_missing_params=skip_on_missing_params, + _use_bar_indicator=_use_bar_indicator, + show_indicator=show_indicator, + ) + tasks.append(asyncio.create_task((task))) + + await asyncio.sleep(throttle_value) await asyncio.gather(*tasks) return scoring_results From 101eb355bc27a667e316482ccab0834f50031640 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Tue, 14 Jan 2025 17:38:13 -0800 Subject: [PATCH 2/4] Add custom scorer test for Vertex AI --- e2etests/judgment_client_test.py | 56 ++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/e2etests/judgment_client_test.py b/e2etests/judgment_client_test.py index 4eb7484b..764173a3 100644 --- a/e2etests/judgment_client_test.py +++ b/e2etests/judgment_client_test.py @@ -9,7 +9,7 @@ FaithfulnessScorer, HallucinationScorer, ) -from judgeval.judges import TogetherJudge +from judgeval.judges import TogetherJudge, judgevalJudge from judgeval.playground import CustomFaithfulnessMetric from judgeval.data.datasets.dataset import EvalDataset from dotenv import load_dotenv @@ -76,6 +76,7 @@ def test_run_eval(client: JudgmentClient): results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME) print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results) + def test_override_eval(client: JudgmentClient): example1 = Example( input="What if these shoes don't fit?", @@ -147,7 +148,6 @@ def test_override_eval(client: JudgmentClient): raise print(f"Successfully caught expected error: {e}") - def test_evaluate_dataset(client: JudgmentClient): @@ -180,6 +180,7 @@ def test_evaluate_dataset(client: JudgmentClient): print(res) + def test_classifier_scorer(client: JudgmentClient): classifier_scorer = client.fetch_classifier_scorer("tonescorer-72gl") faithfulness_scorer = FaithfulnessScorer(threshold=0.5) @@ -197,6 +198,57 @@ def test_classifier_scorer(client: JudgmentClient): ) print(res) + +def test_custom_judge(client: JudgmentClient): + + import vertexai + from vertexai.generative_models import GenerativeModel + + PROJECT_ID = "judgment-labs" + vertexai.init(project=PROJECT_ID, location="us-west1") + + 
class VertexAIJudge(judgevalJudge): + + def __init__(self, model_name: str = "gemini-1.5-flash-002"): + self.model_name = model_name + self.model = GenerativeModel(self.model_name) + + def load_model(self): + return self.model + + def generate(self, prompt) -> str: + # prompt is a List[dict] (conversation history) + # For models that don't support conversation history, we need to convert to string + # If you're using a model that supports chat history, you can just pass the prompt directly + response = self.model.generate_content(str(prompt)) + return response.text + + async def a_generate(self, prompt) -> str: + # prompt is a List[dict] (conversation history) + # For models that don't support conversation history, we need to convert to string + # If you're using a model that supports chat history, you can just pass the prompt directly + response = await self.model.generate_content_async(str(prompt)) + return response.text + + def get_model_name(self) -> str: + return self.model_name + + example = Example( + input="What is the largest animal in the world?", + actual_output="The blue whale is the largest known animal.", + retrieval_context=["The blue whale is the largest known animal."], + ) + + judge = VertexAIJudge() + + res = client.run_evaluation( + examples=[example], + scorers=[CustomFaithfulnessMetric()], + model=judge, + ) + print(res) + + if __name__ == "__main__": # Test client functionality client = get_client() From d0eeedc186b0a5f69b5fa79ca8fffbd7e0bb6f75 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Tue, 14 Jan 2025 17:38:33 -0800 Subject: [PATCH 3/4] Add custom judge test --- judgeval/evaluation_run.py | 32 ++++++++++++++++++++++++-------- judgeval/judgment_client.py | 3 ++- judgeval/scorers/score.py | 6 ++++-- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/judgeval/evaluation_run.py b/judgeval/evaluation_run.py index a731c581..81dcc9c8 100644 --- a/judgeval/evaluation_run.py +++ b/judgeval/evaluation_run.py @@ -2,10 +2,11 @@ from pydantic import BaseModel, field_validator from judgeval.data import Example -from judgeval.data.datasets import EvalDataset from judgeval.scorers import CustomScorer, JudgmentScorer from judgeval.constants import ACCEPTABLE_MODELS from judgeval.common.logger import debug, error +from judgeval.judges import judgevalJudge + class EvaluationRun(BaseModel): """ Stores example and evaluation scorers together for running an eval task @@ -27,7 +28,7 @@ class EvaluationRun(BaseModel): eval_name: Optional[str] = None examples: List[Example] scorers: List[Union[JudgmentScorer, CustomScorer]] - model: Union[str, List[str]] + model: Union[str, List[str], judgevalJudge] aggregator: Optional[str] = None metadata: Optional[Dict[str, Any]] = None # API Key will be "" until user calls client.run_eval(), then API Key will be set @@ -74,18 +75,33 @@ def validate_scorers(cls, v): return v @field_validator('model') - def validate_model(cls, v): + def validate_model(cls, v, values): if not v: raise ValueError("Model cannot be empty.") - if not isinstance(v, str) and not isinstance(v, list): - raise ValueError("Model must be a string or a list of strings.") - if isinstance(v, str) and v not in ACCEPTABLE_MODELS: - raise ValueError(f"Model name {v} not recognized.") + + # Check if model is a judgevalJudge + if isinstance(v, judgevalJudge): + # Verify all scorers are CustomScorer when using judgevalJudge + scorers = values.data.get('scorers', []) + if not all(isinstance(s, CustomScorer) for s in scorers): + raise ValueError("When using a judgevalJudge 
model, all scorers must be CustomScorer type") + return v + + # Check if model is string or list of strings + if isinstance(v, str): + if v not in ACCEPTABLE_MODELS: + raise ValueError(f"Model name {v} not recognized.") + return v + if isinstance(v, list): + if not all(isinstance(m, str) for m in v): + raise ValueError("When providing a list of models, all elements must be strings") for m in v: if m not in ACCEPTABLE_MODELS: raise ValueError(f"Model name {m} not recognized.") - return v + return v + + raise ValueError("Model must be one of: string, list of strings, or judgevalJudge instance") @field_validator('aggregator', mode='before') def validate_aggregator(cls, v, values): diff --git a/judgeval/judgment_client.py b/judgeval/judgment_client.py index fe4636a1..42c3dfcd 100644 --- a/judgeval/judgment_client.py +++ b/judgeval/judgment_client.py @@ -11,6 +11,7 @@ from judgeval.scorers import JudgmentScorer, CustomScorer, ClassifierScorer from judgeval.evaluation_run import EvaluationRun from judgeval.run_evaluation import run_eval +from judgeval.judges import judgevalJudge from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL from judgeval.common.exceptions import JudgmentAPIError from pydantic import BaseModel @@ -38,7 +39,7 @@ def run_evaluation( self, examples: List[Example], scorers: List[Union[JudgmentScorer, CustomScorer]], - model: Union[str, List[str]], + model: Union[str, List[str], judgevalJudge], aggregator: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, log_results: bool = False, diff --git a/judgeval/scorers/score.py b/judgeval/scorers/score.py index c3a790d0..17de15b1 100644 --- a/judgeval/scorers/score.py +++ b/judgeval/scorers/score.py @@ -256,8 +256,10 @@ async def a_execute_scoring( Executes evaluations of `Example`s asynchronously using one or more `CustomScorer`s. Each `Example` will be evaluated by all of the `CustomScorer`s in the `scorers` list. + Args: examples (List[Example]): A list of `Example` objects to be evaluated. - scorers (List[CustomScorer]): A list of `CustomScorer` objects to evaluate the examples. + scorers (List[CustomScorer]): A list of `CustomScorer` objects to evaluate the examples.\ + model (Union[str, List[str], judgevalJudge]): The model to use for evaluation. ignore_errors (bool): Whether to ignore errors during evaluation. skip_on_missing_params (bool): Whether to skip evaluation if parameters are missing. show_indicator (bool): Whether to show a progress indicator. @@ -267,7 +269,7 @@ async def a_execute_scoring( _use_bar_indicator (bool): Whether to use a progress bar indicator. Returns: - List[TestResult]: A list of `TestResult` objects containing the evaluation results. + List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results. 
""" semaphore = asyncio.Semaphore(max_concurrent) From ffdf7eb94b8ad74542e63e91c4a62df22a01a25c Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Tue, 14 Jan 2025 17:40:07 -0800 Subject: [PATCH 4/4] prettify test file with test for custom judge model (vertexai) --- e2etests/judgment_client_test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/e2etests/judgment_client_test.py b/e2etests/judgment_client_test.py index 764173a3..285ca21c 100644 --- a/e2etests/judgment_client_test.py +++ b/e2etests/judgment_client_test.py @@ -199,7 +199,7 @@ def test_classifier_scorer(client: JudgmentClient): print(res) -def test_custom_judge(client: JudgmentClient): +def test_custom_judge_vertexai(client: JudgmentClient): import vertexai from vertexai.generative_models import GenerativeModel @@ -281,4 +281,9 @@ def get_model_name(self) -> str: print("Classifier scorer test successful") print("*" * 40) + print("Testing custom judge") + test_custom_judge_vertexai(ui_client) + print("Custom judge test successful") + print("*" * 40) + print("All tests passed successfully")