From 70d649fc908694c468ff9ed50c30d18a5812e0c1 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Tue, 14 Jan 2025 17:37:44 -0800 Subject: [PATCH 1/4] delete telemetry --- judgeval/common/telemetry.py | 123 ----------------------------------- judgeval/playground.py | 31 +++++---- judgeval/run_evaluation.py | 1 - judgeval/scorers/score.py | 61 +++++++++-------- 4 files changed, 45 insertions(+), 171 deletions(-) delete mode 100644 judgeval/common/telemetry.py diff --git a/judgeval/common/telemetry.py b/judgeval/common/telemetry.py deleted file mode 100644 index 22fd05db..00000000 --- a/judgeval/common/telemetry.py +++ /dev/null @@ -1,123 +0,0 @@ -from contextlib import contextmanager -import logging -import os -import socket -import sys -import uuid -import sentry_sdk -from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( - OTLPSpanExporter, -) - - -def get_unique_id(): - unique_id = os.getenv("judgeval_UNIQUE_ID") - if unique_id is None: - unique_id = str(uuid.uuid4()) - os.environ["judgeval_UNIQUE_ID"] = unique_id - return unique_id - - -def telemetry_opt_out(): - return os.getenv("judgeval_TELEMETRY_OPT_OUT") == "YES" - - -def blocked_by_firewall(): - try: - socket.create_connection(("www.google.com", 80)) - return False - except OSError: - return True - - -if not telemetry_opt_out(): - sentry_sdk.init( - dsn="https://5ef587d58109ee45d6544f3657efdd1f@o4506098477236224.ingest.sentry.io/4506098479136768", - profiles_sample_rate=1.0, - traces_sample_rate=1.0, # For performance monitoring - send_default_pii=False, # Don't send personally identifiable information - attach_stacktrace=False, # Don't attach stack traces to messages - default_integrations=False, # Disable Sentry's default integrations - ) - - # Set up the Tracer Provider - trace.set_tracer_provider(TracerProvider()) - tracer_provider = trace.get_tracer_provider() - - # New Relic License Key and OTLP Endpoint - NEW_RELIC_LICENSE_KEY = "1711c684db8a30361a7edb0d0398772cFFFFNRAL" - NEW_RELIC_OTLP_ENDPOINT = "https://otlp.nr-data.net:4317" - otlp_exporter = OTLPSpanExporter( - endpoint=NEW_RELIC_OTLP_ENDPOINT, - headers={"api-key": NEW_RELIC_LICENSE_KEY}, - ) - - # Add the OTLP exporter to the span processor - span_processor = BatchSpanProcessor(otlp_exporter) - tracer_provider.add_span_processor(span_processor) - - logging.getLogger("opentelemetry.exporter.otlp").setLevel(logging.CRITICAL) - - # Create a tracer for your application - tracer = trace.get_tracer(__name__) - - -if ( - os.getenv("ERROR_REPORTING") == "YES" - and not blocked_by_firewall() - and not os.getenv("TELEMETRY_OPT_OUT") -): - - def handle_exception(exc_type, exc_value, exc_traceback): - print({"exc_type": exc_type, "exc_value": exc_value}) - sentry_sdk.capture_exception(exc_value) - sys.__excepthook__(exc_type, exc_value, exc_traceback) - - sys.excepthook = handle_exception - - -@contextmanager -def capture_evaluation_run(type: str): - if not telemetry_opt_out(): - with tracer.start_as_current_span(f"Evaluation run: {type}") as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield - - -@contextmanager -def capture_metric_type(metric_name: str, _track: bool = True): - if not telemetry_opt_out() and _track: - with tracer.start_as_current_span(metric_name) as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield - - -@contextmanager -def 
capture_synthesizer_run(max_generations: int = None, method: str = None): - if not telemetry_opt_out() and max_generations is not None: - with tracer.start_as_current_span( - f"Invoked synthesizer ({max_generations}) | Method: {method}" - ) as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield - - -@contextmanager -def capture_red_teamer_run(task: str): - if not telemetry_opt_out(): - with tracer.start_as_current_span( - f"Invoked red teamer: ({task})" - ) as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield diff --git a/judgeval/playground.py b/judgeval/playground.py index c5d065c6..36eab39e 100644 --- a/judgeval/playground.py +++ b/judgeval/playground.py @@ -15,7 +15,6 @@ from judgeval.judges.utils import create_judge from judgeval.scorers.custom_scorer import CustomScorer from judgeval.scorers.score import * -from judgeval.common.telemetry import capture_metric_type """ Testing implementation of CustomFaithfulness @@ -195,22 +194,22 @@ def metric_progress_indicator( total: int = 9999, transient: bool = True, ): - with capture_metric_type(metric.__name__): - console = Console(file=sys.stderr) # Direct output to standard error - if _show_indicator: - with Progress( - SpinnerColumn(style="rgb(106,0,255)"), - TextColumn("[progress.description]{task.description}"), - console=console, # Use the custom console - transient=transient, - ) as progress: - progress.add_task( - description=scorer_console_msg(metric, async_mode), - total=total, - ) - yield - else: + + console = Console(file=sys.stderr) # Direct output to standard error + if _show_indicator: + with Progress( + SpinnerColumn(style="rgb(106,0,255)"), + TextColumn("[progress.description]{task.description}"), + console=console, # Use the custom console + transient=transient, + ) as progress: + progress.add_task( + description=scorer_console_msg(metric, async_mode), + total=total, + ) yield + else: + yield def prettify_list(lst: List[Any]): diff --git a/judgeval/run_evaluation.py b/judgeval/run_evaluation.py index c32676cf..7a18b373 100644 --- a/judgeval/run_evaluation.py +++ b/judgeval/run_evaluation.py @@ -265,7 +265,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor else: custom_scorers.append(scorer) debug(f"Added custom scorer: {type(scorer).__name__}") - debug(f"Found {len(judgment_scorers)} judgment scorers and {len(custom_scorers)} custom scorers") api_results: List[ScoringResult] = [] diff --git a/judgeval/scorers/score.py b/judgeval/scorers/score.py index b16352e8..c3a790d0 100644 --- a/judgeval/scorers/score.py +++ b/judgeval/scorers/score.py @@ -18,7 +18,6 @@ ) from judgeval.scorers import CustomScorer from judgeval.scorers.utils import clone_scorers, scorer_console_msg -from judgeval.common.telemetry import capture_evaluation_run from judgeval.common.exceptions import MissingTestCaseParamsError from judgeval.common.logger import example_logging_context, debug, error, warning, info from judgeval.judges import judgevalJudge @@ -312,36 +311,10 @@ async def execute_with_semaphore(func: Callable, *args, **kwargs): debug(f"Scorer threshold: {scorer.threshold}") if hasattr(scorer, 'model'): debug(f"Scorer model: {type(scorer.model).__name__}") - with capture_evaluation_run("Example"): - if isinstance(ex, Example): - if len(scorers) == 0: - pbar.update(1) - continue - - cloned_scorers: List[CustomScorer] = clone_scorers( - scorers - ) - task = execute_with_semaphore( - func=a_eval_examples_helper, - 
scorers=cloned_scorers, - example=ex, - scoring_results=scoring_results, - score_index=i, - ignore_errors=ignore_errors, - skip_on_missing_params=skip_on_missing_params, - show_indicator=show_indicator, - _use_bar_indicator=_use_bar_indicator, - pbar=pbar, - ) - tasks.append(asyncio.create_task(task)) - - await asyncio.sleep(throttle_value) - await asyncio.gather(*tasks) - else: - for i, ex in enumerate(examples): - with capture_evaluation_run("Example"): + if isinstance(ex, Example): if len(scorers) == 0: + pbar.update(1) continue cloned_scorers: List[CustomScorer] = clone_scorers( @@ -355,12 +328,38 @@ async def execute_with_semaphore(func: Callable, *args, **kwargs): score_index=i, ignore_errors=ignore_errors, skip_on_missing_params=skip_on_missing_params, - _use_bar_indicator=_use_bar_indicator, show_indicator=show_indicator, + _use_bar_indicator=_use_bar_indicator, + pbar=pbar, ) - tasks.append(asyncio.create_task((task))) + tasks.append(asyncio.create_task(task)) await asyncio.sleep(throttle_value) + await asyncio.gather(*tasks) + else: + for i, ex in enumerate(examples): + + if isinstance(ex, Example): + if len(scorers) == 0: + continue + + cloned_scorers: List[CustomScorer] = clone_scorers( + scorers + ) + task = execute_with_semaphore( + func=a_eval_examples_helper, + scorers=cloned_scorers, + example=ex, + scoring_results=scoring_results, + score_index=i, + ignore_errors=ignore_errors, + skip_on_missing_params=skip_on_missing_params, + _use_bar_indicator=_use_bar_indicator, + show_indicator=show_indicator, + ) + tasks.append(asyncio.create_task((task))) + + await asyncio.sleep(throttle_value) await asyncio.gather(*tasks) return scoring_results From 101eb355bc27a667e316482ccab0834f50031640 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Tue, 14 Jan 2025 17:38:13 -0800 Subject: [PATCH 2/4] Add custom scorer test for Vertex AI --- e2etests/judgment_client_test.py | 56 ++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/e2etests/judgment_client_test.py b/e2etests/judgment_client_test.py index 4eb7484b..764173a3 100644 --- a/e2etests/judgment_client_test.py +++ b/e2etests/judgment_client_test.py @@ -9,7 +9,7 @@ FaithfulnessScorer, HallucinationScorer, ) -from judgeval.judges import TogetherJudge +from judgeval.judges import TogetherJudge, judgevalJudge from judgeval.playground import CustomFaithfulnessMetric from judgeval.data.datasets.dataset import EvalDataset from dotenv import load_dotenv @@ -76,6 +76,7 @@ def test_run_eval(client: JudgmentClient): results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME) print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results) + def test_override_eval(client: JudgmentClient): example1 = Example( input="What if these shoes don't fit?", @@ -147,7 +148,6 @@ def test_override_eval(client: JudgmentClient): raise print(f"Successfully caught expected error: {e}") - def test_evaluate_dataset(client: JudgmentClient): @@ -180,6 +180,7 @@ def test_evaluate_dataset(client: JudgmentClient): print(res) + def test_classifier_scorer(client: JudgmentClient): classifier_scorer = client.fetch_classifier_scorer("tonescorer-72gl") faithfulness_scorer = FaithfulnessScorer(threshold=0.5) @@ -197,6 +198,57 @@ def test_classifier_scorer(client: JudgmentClient): ) print(res) + +def test_custom_judge(client: JudgmentClient): + + import vertexai + from vertexai.generative_models import GenerativeModel + + PROJECT_ID = "judgment-labs" + vertexai.init(project=PROJECT_ID, location="us-west1") + + 
class VertexAIJudge(judgevalJudge): + + def __init__(self, model_name: str = "gemini-1.5-flash-002"): + self.model_name = model_name + self.model = GenerativeModel(self.model_name) + + def load_model(self): + return self.model + + def generate(self, prompt) -> str: + # prompt is a List[dict] (conversation history) + # For models that don't support conversation history, we need to convert to string + # If you're using a model that supports chat history, you can just pass the prompt directly + response = self.model.generate_content(str(prompt)) + return response.text + + async def a_generate(self, prompt) -> str: + # prompt is a List[dict] (conversation history) + # For models that don't support conversation history, we need to convert to string + # If you're using a model that supports chat history, you can just pass the prompt directly + response = await self.model.generate_content_async(str(prompt)) + return response.text + + def get_model_name(self) -> str: + return self.model_name + + example = Example( + input="What is the largest animal in the world?", + actual_output="The blue whale is the largest known animal.", + retrieval_context=["The blue whale is the largest known animal."], + ) + + judge = VertexAIJudge() + + res = client.run_evaluation( + examples=[example], + scorers=[CustomFaithfulnessMetric()], + model=judge, + ) + print(res) + + if __name__ == "__main__": # Test client functionality client = get_client() From d0eeedc186b0a5f69b5fa79ca8fffbd7e0bb6f75 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Tue, 14 Jan 2025 17:38:33 -0800 Subject: [PATCH 3/4] Add custom judge test --- judgeval/evaluation_run.py | 32 ++++++++++++++++++++++++-------- judgeval/judgment_client.py | 3 ++- judgeval/scorers/score.py | 6 ++++-- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/judgeval/evaluation_run.py b/judgeval/evaluation_run.py index a731c581..81dcc9c8 100644 --- a/judgeval/evaluation_run.py +++ b/judgeval/evaluation_run.py @@ -2,10 +2,11 @@ from pydantic import BaseModel, field_validator from judgeval.data import Example -from judgeval.data.datasets import EvalDataset from judgeval.scorers import CustomScorer, JudgmentScorer from judgeval.constants import ACCEPTABLE_MODELS from judgeval.common.logger import debug, error +from judgeval.judges import judgevalJudge + class EvaluationRun(BaseModel): """ Stores example and evaluation scorers together for running an eval task @@ -27,7 +28,7 @@ class EvaluationRun(BaseModel): eval_name: Optional[str] = None examples: List[Example] scorers: List[Union[JudgmentScorer, CustomScorer]] - model: Union[str, List[str]] + model: Union[str, List[str], judgevalJudge] aggregator: Optional[str] = None metadata: Optional[Dict[str, Any]] = None # API Key will be "" until user calls client.run_eval(), then API Key will be set @@ -74,18 +75,33 @@ def validate_scorers(cls, v): return v @field_validator('model') - def validate_model(cls, v): + def validate_model(cls, v, values): if not v: raise ValueError("Model cannot be empty.") - if not isinstance(v, str) and not isinstance(v, list): - raise ValueError("Model must be a string or a list of strings.") - if isinstance(v, str) and v not in ACCEPTABLE_MODELS: - raise ValueError(f"Model name {v} not recognized.") + + # Check if model is a judgevalJudge + if isinstance(v, judgevalJudge): + # Verify all scorers are CustomScorer when using judgevalJudge + scorers = values.data.get('scorers', []) + if not all(isinstance(s, CustomScorer) for s in scorers): + raise ValueError("When using a judgevalJudge 
model, all scorers must be CustomScorer type") + return v + + # Check if model is string or list of strings + if isinstance(v, str): + if v not in ACCEPTABLE_MODELS: + raise ValueError(f"Model name {v} not recognized.") + return v + if isinstance(v, list): + if not all(isinstance(m, str) for m in v): + raise ValueError("When providing a list of models, all elements must be strings") for m in v: if m not in ACCEPTABLE_MODELS: raise ValueError(f"Model name {m} not recognized.") - return v + return v + + raise ValueError("Model must be one of: string, list of strings, or judgevalJudge instance") @field_validator('aggregator', mode='before') def validate_aggregator(cls, v, values): diff --git a/judgeval/judgment_client.py b/judgeval/judgment_client.py index fe4636a1..42c3dfcd 100644 --- a/judgeval/judgment_client.py +++ b/judgeval/judgment_client.py @@ -11,6 +11,7 @@ from judgeval.scorers import JudgmentScorer, CustomScorer, ClassifierScorer from judgeval.evaluation_run import EvaluationRun from judgeval.run_evaluation import run_eval +from judgeval.judges import judgevalJudge from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL from judgeval.common.exceptions import JudgmentAPIError from pydantic import BaseModel @@ -38,7 +39,7 @@ def run_evaluation( self, examples: List[Example], scorers: List[Union[JudgmentScorer, CustomScorer]], - model: Union[str, List[str]], + model: Union[str, List[str], judgevalJudge], aggregator: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, log_results: bool = False, diff --git a/judgeval/scorers/score.py b/judgeval/scorers/score.py index c3a790d0..17de15b1 100644 --- a/judgeval/scorers/score.py +++ b/judgeval/scorers/score.py @@ -256,8 +256,10 @@ async def a_execute_scoring( Executes evaluations of `Example`s asynchronously using one or more `CustomScorer`s. Each `Example` will be evaluated by all of the `CustomScorer`s in the `scorers` list. + Args: examples (List[Example]): A list of `Example` objects to be evaluated. - scorers (List[CustomScorer]): A list of `CustomScorer` objects to evaluate the examples. + scorers (List[CustomScorer]): A list of `CustomScorer` objects to evaluate the examples.\ + model (Union[str, List[str], judgevalJudge]): The model to use for evaluation. ignore_errors (bool): Whether to ignore errors during evaluation. skip_on_missing_params (bool): Whether to skip evaluation if parameters are missing. show_indicator (bool): Whether to show a progress indicator. @@ -267,7 +269,7 @@ async def a_execute_scoring( _use_bar_indicator (bool): Whether to use a progress bar indicator. Returns: - List[TestResult]: A list of `TestResult` objects containing the evaluation results. + List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results. 
""" semaphore = asyncio.Semaphore(max_concurrent) From ffdf7eb94b8ad74542e63e91c4a62df22a01a25c Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Tue, 14 Jan 2025 17:40:07 -0800 Subject: [PATCH 4/4] prettify test file with test for custom judge model (vertexai) --- e2etests/judgment_client_test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/e2etests/judgment_client_test.py b/e2etests/judgment_client_test.py index 764173a3..285ca21c 100644 --- a/e2etests/judgment_client_test.py +++ b/e2etests/judgment_client_test.py @@ -199,7 +199,7 @@ def test_classifier_scorer(client: JudgmentClient): print(res) -def test_custom_judge(client: JudgmentClient): +def test_custom_judge_vertexai(client: JudgmentClient): import vertexai from vertexai.generative_models import GenerativeModel @@ -281,4 +281,9 @@ def get_model_name(self) -> str: print("Classifier scorer test successful") print("*" * 40) + print("Testing custom judge") + test_custom_judge_vertexai(ui_client) + print("Custom judge test successful") + print("*" * 40) + print("All tests passed successfully")