
Commit b245399

Author: Judgment Release Bot
[Bump Minor Version] Release: Merge staging to main
2 parents 069c812 + a0c3ff8 · commit b245399

25 files changed: +1087 -999 lines

pyproject.toml

5 additions & 0 deletions

@@ -29,12 +29,17 @@ dependencies = [
     "langchain-openai",
     "langchain-anthropic",
     "langchain-core",
+    "click<8.2.0",
+    "typer>=0.9.0",
 ]
 
 [project.urls]
 Homepage = "https://github.com/JudgmentLabs/judgeval"
 Issues = "https://github.com/JudgmentLabs/judgeval/issues"
 
+[project.scripts]
+judgeval = "judgeval.cli:app"
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
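
The new [project.scripts] entry exposes a "judgeval" console script that resolves to judgeval.cli:app, and the added typer>=0.9.0 dependency (with click<8.2.0 pinned; Typer is built on Click) suggests that app is a Typer application. The CLI module itself is not part of this diff, so the following is only a minimal sketch of what src/judgeval/cli.py could look like; the "version" command and the callback are invented here for illustration and are not taken from this commit.

# Hypothetical sketch of src/judgeval/cli.py (not from this commit).
# Assumes the console script entry judgeval = "judgeval.cli:app" points at a Typer app.
import importlib.metadata

import typer

app = typer.Typer(help="judgeval command-line interface")


@app.callback()
def main() -> None:
    """Top-level CLI group; keeps commands like 'version' as subcommands."""


@app.command()
def version() -> None:
    """Print the installed judgeval version (illustrative command only)."""
    typer.echo(importlib.metadata.version("judgeval"))


if __name__ == "__main__":
    app()

On install, hatchling generates the judgeval executable from this entry point, so the hypothetical command above would be invoked as "judgeval version".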

src/e2etests/test_all_scorers.py

0 additions & 141 deletions
@@ -9,9 +9,7 @@
     FaithfulnessScorer,
     InstructionAdherenceScorer,
     ExecutionOrderScorer,
-    PromptScorer,
 )
-from uuid import uuid4
 from judgeval.data import Example
 from judgeval.constants import DEFAULT_TOGETHER_MODEL
 
@@ -32,7 +30,6 @@ def test_ac_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )
     print_debug_on_failure(res[0])
 
@@ -58,7 +55,6 @@ def test_ar_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )
 
     print_debug_on_failure(res[0])
@@ -101,7 +97,6 @@ def test_faithfulness_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )
 
     print_debug_on_failure(res[0])
@@ -127,7 +122,6 @@ def test_instruction_adherence_scorer(client: JudgmentClient, project_name: str)
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )
 
     print_debug_on_failure(res[0])
@@ -160,146 +154,11 @@ def test_execution_order_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )
 
     assert not res[0].success
 
 
-def test_prompt_scorer_without_options(client: JudgmentClient, project_name: str):
-    """Test prompt scorer functionality."""
-
-    prompt_scorer = PromptScorer.create(
-        name=f"Test Prompt Scorer Without Options {uuid4()}",
-        prompt="Question: {{input}}\nResponse: {{actual_output}}\n\nIs this response relevant to the question?",
-    )
-
-    relevant_example = Example(
-        input="What's the weather in New York?",
-        actual_output="The weather in New York is sunny.",
-    )
-
-    irrelevant_example = Example(
-        input="What's the capital of France?",
-        actual_output="The mitochondria is the powerhouse of the cell, and did you know that honey never spoils?",
-    )
-
-    # Run evaluation
-    res = client.run_evaluation(
-        examples=[relevant_example, irrelevant_example],
-        scorers=[prompt_scorer],
-        model=DEFAULT_TOGETHER_MODEL,
-        project_name=project_name,
-        eval_run_name="test-run-prompt-scorer-without-options",
-        override=True,
-    )
-
-    # Verify results
-    assert res[0].success, "Relevant example should pass classification"
-    assert not res[1].success, "Irrelevant example should fail classification"
-
-    print_debug_on_failure(res[0])
-    print_debug_on_failure(res[1])
-
-
-def test_prompt_scorer_with_options(client: JudgmentClient, project_name: str):
-    """Test prompt scorer functionality."""
-    # Creating a prompt scorer from SDK
-    prompt_scorer = PromptScorer.create(
-        name=f"Test Prompt Scorer {uuid4()}",
-        prompt="Question: {{input}}\nResponse: {{actual_output}}\n\nIs this response helpful?",
-        options={"yes": 1.0, "no": 0.0},
-    )
-
-    # Update the options with helpfulness classification choices
-    prompt_scorer.set_options(
-        {
-            "yes": 1.0,  # Helpful response
-            "no": 0.0,  # Unhelpful response
-        }
-    )
-
-    # Create test examples
-    helpful_example = Example(
-        input="What's the capital of France?",
-        actual_output="The capital of France is Paris.",
-    )
-
-    unhelpful_example = Example(
-        input="What's the capital of France?",
-        actual_output="I don't know much about geography, but I think it might be somewhere in Europe.",
-    )
-
-    # Run evaluation
-    res = client.run_evaluation(
-        examples=[helpful_example, unhelpful_example],
-        scorers=[prompt_scorer],
-        model=DEFAULT_TOGETHER_MODEL,
-        project_name=project_name,
-        eval_run_name="test-run-prompt-scorer-with-options",
-        override=True,
-    )
-
-    # Verify results
-    assert res[0].success, "Helpful example should pass classification"
-    assert not res[1].success, "Unhelpful example should fail classification"
-
-    # Print debug info if any test fails
-    print_debug_on_failure(res[0])
-    print_debug_on_failure(res[1])
-
-
-def test_custom_prompt_scorer(client: JudgmentClient, project_name: str):
-    """Test custom prompt scorer functionality."""
-    # Creating a custom prompt scorer from SDK
-    # Creating a prompt scorer from SDK
-    prompt_scorer = PromptScorer.create(
-        name=f"Test Prompt Scorer {uuid4()}",
-        prompt="Comparison A: {{comparison_a}}\n Comparison B: {{comparison_b}}\n\n Which candidate is better for a teammate?",
-        options={"comparison_a": 1.0, "comparison_b": 0.0},
-    )
-
-    prompt_scorer.set_options(
-        {
-            "comparison_a": 1.0,
-            "comparison_b": 0.0,
-        }
-    )
-
-    class ComparisonExample(Example):
-        comparison_a: str
-        comparison_b: str
-
-    # Create test examples
-    example1 = ComparisonExample(
-        comparison_a="Mike loves to play basketball because he passes with his teammates.",
-        comparison_b="Mike likes to play 1v1 basketball because he likes to show off his skills.",
-    )
-
-    example2 = ComparisonExample(
-        comparison_a="Mike loves to play singles tennis because he likes to only hit by himself and not with a partner and is selfish.",
-        comparison_b="Mike likes to play doubles tennis because he likes to coordinate with his partner.",
-    )
-
-    # Run evaluation
-    res = client.run_evaluation(
-        examples=[example1, example2],
-        scorers=[prompt_scorer],
-        model=DEFAULT_TOGETHER_MODEL,
-        project_name=project_name,
-        eval_run_name="test-custom-prompt-scorer",
-        override=True,
-    )
-
-    # Verify results
-    assert res[0].success, "Example 1 should pass classification"
-    assert not res[1].success, "Example 2 should fail classification"
-
-    # Print debug info if any test fails
-    print_debug_on_failure(res[0])
-    print_debug_on_failure(res[1])
-
-
 def print_debug_on_failure(result) -> bool:
     """
     Helper function to print debug info only on test failure
