Commit 7c18212

refactor: Overhauled utility classes for ease of use
1 parent b2f2445 commit 7c18212

7 files changed: +112 −78 lines changed

7 files changed

+112
-78
lines changed

docs/hello-world.md

Lines changed: 31 additions & 11 deletions

@@ -44,30 +44,50 @@ from langdiversity.parser import extract_math_answer
 model = OpenAIModel(openai_api_key="[API KEY]", extractor=extract_math_answer)
 ```

-### Prompt Selection
+### Collecting Diversity Measures

-Now, we initialize the `PromptSelection` object. This is where we specify how many responses we want from the language model for each prompt, the diversity measure to use, and the selection method.
+In this step, we initialize the `DiversityMeasureCollector` object. This is where we specify how many responses we want from the language model for each prompt and the diversity measure to use.

 ```python
-from langdiversity.utils import PromptSelection
-prompt_selection = PromptSelection(model=model, num_responses=4, diversity_measure=diversity_measure, selection='min')
+from langdiversity.utils import DiversityMeasureCollector
+diversity_collector = DiversityMeasureCollector(model=model, num_responses=4, diversity_measure=diversity_measure)
 ```

-### Generate and Select Prompts
-
-Finally, we pass in a list of prompts to the `PromptSelection` object. It will send these prompts to the language model, calculate the diversity measure for each set of responses, and then select the prompt with the minimum (or maximum) diversity measure.
+### Collecting Data

-The selected prompt and its corresponding diversity measure are stored in `selected_prompt` and `selected_measure`, respectively.
+Next, we pass in a list of prompts to the `DiversityMeasureCollector` object. It will send these prompts to the language model, collect the responses, and calculate the diversity measure for each set of responses.

 Note: The prompts are structured to guide the language model in generating a specific type of response. This makes it easier for the parser to extract clean answers.

 ```python
-selected_prompt, selected_measure = prompt_selection.generate([
+prompts = [
     "At the end, say 'the answer is [put your numbers here separated by commas]'.\nQuestion: What is the speed of the current if Junior's boat can cover 12 miles downstream in the same time it takes to travel 9 miles upstream, given that his boat's speed in still water is 15 miles per hour?",
     "At the end, say 'the answer is [put your numbers here separated by commas]'.\nQuestion: What is the speed of the current if Junior's boat travels at a constant speed of 15 miles per hour in still water and he spends the same amount of time traveling 12 miles downstream as he does traveling 9 miles upstream?.",
     "At the end, say 'the answer is [put your numbers here separated by commas]'.\nQuestion: Juniors boat will go 15 miles per hour in still water . If he can go 12 miles downstream in the same amount of time as it takes to go 9 miles upstream , then what is the speed of the current?",
-])
+]
+
+diversity_collector.collect(prompts, verbose=True)  # Set verbose to True to see intermediate values
+```
+
+### Prompt Selection
+
+Now, we initialize the `PromptSelection` object with the data collected in the previous step.
+
+```python
+from langdiversity.utils import PromptSelection
+prompt_selection = PromptSelection(data=diversity_collector.data, selection='min')
+```
+
+### Selecting Prompts
+
+Finally, we call the `select` method on the `PromptSelection` object to select the prompt with the desired diversity measure based on the configured selection method.
+
+In this example, the selected prompt and its corresponding diversity measure are stored in `selected_prompt` and `selected_diversity`, respectively.
+
+```python
+selected_prompt, selected_diversity = prompt_selection.select()

 print("Selected Prompt:", selected_prompt)
-print("Selected Measure:", selected_measure)
+print("Selected Diversity:", selected_diversity)
 ```
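Taken together, the change to this doc replaces the old one-shot `PromptSelection.generate` call with a collect-then-select workflow. The flow can be sketched standalone as follows (a minimal illustration using a hypothetical stub model and a local entropy function, since the real `OpenAIModel` requires an API key; names here are not from the library):

```python
import math
from collections import Counter

# Hypothetical stand-in for a language model (assumption: the real model
# exposes generate(prompt, num_responses) -> list of parsed answers).
class StubModel:
    def __init__(self, canned):
        self.canned = canned  # prompt -> canned list of responses

    def generate(self, prompt, num_responses):
        return self.canned[prompt][:num_responses]

def shannon_entropy(responses):
    # Entropy over the distribution of parsed answers; 0 means full agreement.
    counts = Counter(responses)
    total = len(responses)
    return -sum((c / total) * math.log2(c / total) for c in counts.values())

canned = {
    "prompt A": ["3", "3", "3", "3"],  # fully consistent answers
    "prompt B": ["3", "4", "3", "5"],  # mixed answers -> higher entropy
}
model = StubModel(canned)

# Collect phase: one {prompt, responses, diversity} record per prompt.
data = []
for prompt in canned:
    responses = model.generate(prompt, 4)
    data.append({"prompt": prompt,
                 "responses": responses,
                 "diversity": shannon_entropy(responses)})

# Select phase: 'min' favors the prompt whose answers agree the most.
selected = min(data, key=lambda d: d["diversity"])
print(selected["prompt"])  # prompt A
```

Splitting collection from selection like this lets the same collected records be re-used with different selection criteria without re-querying the model.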

docs/langdiversity_library.md

Lines changed: 29 additions & 11 deletions

@@ -13,23 +13,37 @@ pip install langdiversity
 Example:

 ```python
+import os
+from dotenv import load_dotenv
+
+from langdiversity.utils import PromptSelection, DiversityMeasureCollector
 from langdiversity.models import OpenAIModel
 from langdiversity.measures import ShannonEntropyMeasure
-from langdiversity.utils import PromptSelection
-from langdiversity.parser import # Select a parser that suits your question set
+from langdiversity.parser import extract_last_letters  # Select a parser that suits your question set
+
+load_dotenv()
+openai_api_key = os.getenv("OPENAI_API_KEY")  # place your language model's API key in a .env file

 # Initialize the OpenAI model and diversity measure
-model = OpenAIModel(openai_api_key="[YOUR API KEY]", extractor="[SELECT YOUR PARSER](optional)")
+model = OpenAIModel(openai_api_key=openai_api_key, extractor=extract_last_letters)
 diversity_measure = ShannonEntropyMeasure()

-# Use the PromptSelection utility
-prompt_selection = PromptSelection(model=model, num_responses=10, diversity_measure=diversity_measure)
+# Define your list of prompts
+prompts = [
+    "At the end, say 'the answer is [put the concatenated word here]'.\nQuestion: Take the last letter of each word in \"Tal Evan Lesley Sidney\" and concatenate them..",
+    # ... Add more prompts as needed
+]
+
+# Create an instance of DiversityMeasureCollector and collect diversity measures
+diversity_collector = DiversityMeasureCollector(model=model, num_responses=4, diversity_measure=diversity_measure)
+diversity_collector.collect(prompts)

-# Pass in question set to the LLM & selects the prompt with the configured diversity measure criteria from the LLM's 10 responses
-selected_prompt, selected_measure = prompt_selection.generate(["Your list of prompts here..."])
+# Create an instance of PromptSelection and select a prompt
+prompt_selection = PromptSelection(data=diversity_collector.data, selection="min")
+selected_prompt, selected_measure = prompt_selection.select()

-print("Selected Prompt:", selected_prompt)
-print("Selected Measure:", selected_measure)
+print("Selected prompt:", selected_prompt)
+print("Selected measure:", selected_measure)
 ```

@@ -48,21 +62,25 @@ LangDiversity offers a variety of modules for different use-cases. Below are the
 - [Utility Classes](https://github.com/lab-v2/langdiversity/tree/main/langdiversity/utils) (`langdiversity.utils`)

   - `PromptSelection`: Handles the selection of prompts based on diversity measures.
-  - `DiversityCalculator`: Calculates various diversity measures for a given set of values. Supports Shannon's entropy and Gini impurity by default.
+  - `DiversityMeasureCollector`: Collects diversity measures for a given set of prompts using a specified language model and diversity measure algorithm.

 - [Parsers](https://github.com/lab-v2/langdiversity/tree/main/langdiversity/parser) (`langdiversity.parsers`)
   - `extract_last_letters(response: str)`: Extracts the last letters of each word in the response.
   - `extract_math_answer(response: str)`: Extracts numerical answers from a mathematical question in the response.
   - `extract_multi_choice_answer(response: str)`: Extracts the selected choice (A, B, C, D, E) from a multiple-choice question in the response.

-### PromptSelection Paramaters:
+### DiversityMeasureCollector Parameters:

 - `model`: The language model you want to use. In this example, we're using OpenAI's model.

 - `diversity_measure`: The diversity measure you want to use. Here, we're using entropy.

 - `num_responses`: The number of responses you want the model to generate for each prompt. Default is 1.

+### PromptSelection Parameters:
+
+- `data`: A list of dictionaries, each containing information about a prompt, the responses it generated, and its diversity measure. This data is collected using the `DiversityMeasureCollector` class.
+
 - `selection`: Determines how the best prompt is selected based on its diversity measure. It can be:

   - `"min"`: Selects the prompt with the minimum diversity measure. (default)
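As an aside on the parser names documented above, the last-letters task can be illustrated by a tiny helper (a hypothetical sketch, not the library's actual `extract_last_letters`, which parses the model's full response text):

```python
def last_letters(words: str) -> str:
    # Concatenate the last letter of each whitespace-separated word.
    return "".join(w[-1] for w in words.split())

# Matches the example question used throughout the docs:
print(last_letters("Tal Evan Lesley Sidney"))  # lnyy
```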

examples/prompt_selection.py

Lines changed: 14 additions & 12 deletions

@@ -1,7 +1,7 @@
 import os
 from dotenv import load_dotenv

-from langdiversity.utils import PromptSelection
+from langdiversity.utils import PromptSelection, DiversityMeasureCollector
 from langdiversity.models import OpenAIModel
 from langdiversity.measures import ShannonEntropyMeasure
 from langdiversity.parser import extract_last_letters
@@ -11,16 +11,18 @@

 diversity_measure = ShannonEntropyMeasure()
 model = OpenAIModel(openai_api_key=openai_api_key, extractor=extract_last_letters)
-prompt_selection = PromptSelection(
-    model=model, num_responses=4, diversity_measure=diversity_measure, selection="min"
-)
-selected_prompt, selected_diversity = prompt_selection.generate(
-    [
-        "At the end, say 'the answer is [put the concatenated word here]'.\nQuestion: Take the last letter of each word in \"Tal Evan Lesley Sidney\" and concatenate them..",
-        "At the end, say 'the answer is [put the concatenated word here]'.\nQuestion: Concatenate the last letter of each word in \"Tal Evan Lesley Sidney\".",
-        "At the end, say 'the answer is [put the concatenated word here]'.\nQuestion: Combine the last letter of each word in \"Tal Evan Lesley Sidney\".",
-    ]
-)
+
+prompts = [
+    "At the end, say 'the answer is [put the concatenated word here]'.\nQuestion: Take the last letter of each word in \"Tal Evan Lesley Sidney\" and concatenate them..",
+    "At the end, say 'the answer is [put the concatenated word here]'.\nQuestion: Concatenate the last letter of each word in \"Tal Evan Lesley Sidney\".",
+    "At the end, say 'the answer is [put the concatenated word here]'.\nQuestion: Combine the last letter of each word in \"Tal Evan Lesley Sidney\".",
+]
+
+diversity_collector = DiversityMeasureCollector(model=model, num_responses=4, diversity_measure=diversity_measure)
+diversity_collector.collect(prompts, verbose=True)
+
+prompt_selection = PromptSelection(data=diversity_collector.data, selection="min")
+selected_prompt, selected_diversity = prompt_selection.select()

 print("SELECTED PROMPT:", selected_prompt)
-print("SELECTED DIVERSITY:", selected_diversity)
+print("SELECTED DIVERSITY:", selected_diversity)

langdiversity/utils/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -1,2 +1,3 @@
 from .calculate_measures import *
+from .diversity_measures import *
 from .prompt_selection import *
langdiversity/utils/diversity_measures.py (new file)

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
+from typing import List
+
+from langdiversity.extras.spinner import loading_spinner
+
+from langdiversity.measures import AbstractMeasure
+from langdiversity.models import AbstractBaseModel
+
+class DiversityMeasureCollector:
+    def __init__(self, model: AbstractBaseModel, diversity_measure: AbstractMeasure, num_responses: int = 1):
+        self.model = model
+        self.diversity_measure = diversity_measure
+        self.num_responses = num_responses
+        self.data = []  # A list to store the data (prompt, responses, diversity measure)
+
+    def collect(self, prompts: List[str], verbose: bool = False):
+        total_prompts = len(prompts)
+        for i, prompt in enumerate(prompts):
+            with loading_spinner(f"Collecting {self.num_responses} responses...", current_step=i + 1, total_steps=total_prompts):
+                responses = self.model.generate(prompt, self.num_responses)
+            with loading_spinner("Performing diversity measure calculations...", current_step=i + 1, total_steps=total_prompts):
+                diversity = self.diversity_measure.generate(responses)
+            if verbose:
+                print(f"Prompt {i + 1}: {prompt}")
+                print(f"Responses: {', '.join(responses)}")  # Assuming responses are strings
+                print(f"Diversity: {diversity}")
+            self.data.append(
+                {"prompt": prompt, "responses": responses, "diversity": diversity}
+            )
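To see the collector's contract without the `loading_spinner` dependency, here is a simplified, runnable re-sketch: the spinner calls are dropped and the model and measure are local fakes, so this is an illustration of the committed logic, not the shipped class.

```python
import math
from collections import Counter

# Simplified re-sketch of DiversityMeasureCollector (spinner removed).
class DiversityMeasureCollector:
    def __init__(self, model, diversity_measure, num_responses=1):
        self.model = model
        self.diversity_measure = diversity_measure
        self.num_responses = num_responses
        self.data = []  # one record per prompt: {prompt, responses, diversity}

    def collect(self, prompts, verbose=False):
        for i, prompt in enumerate(prompts):
            responses = self.model.generate(prompt, self.num_responses)
            diversity = self.diversity_measure.generate(responses)
            if verbose:
                print(f"Prompt {i + 1}: {prompt}")
                print(f"Responses: {', '.join(responses)}")
                print(f"Diversity: {diversity}")
            self.data.append(
                {"prompt": prompt, "responses": responses, "diversity": diversity}
            )

# Local fakes standing in for the real model and measure (assumptions).
class FakeModel:
    def generate(self, prompt, n):
        return ["a", "a", "b"][:n]

class EntropyMeasure:
    def generate(self, responses):
        counts = Counter(responses)
        n = len(responses)
        return -sum((c / n) * math.log2(c / n) for c in counts.values())

collector = DiversityMeasureCollector(FakeModel(), EntropyMeasure(), num_responses=3)
collector.collect(["p1"])
print(len(collector.data))  # 1
```

Each call to `collect` appends to `self.data` rather than replacing it, so repeated calls accumulate records across prompt batches.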
langdiversity/utils/prompt_selection.py

Lines changed: 8 additions & 43 deletions

@@ -1,53 +1,18 @@
-from typing import List
-
-from langdiversity.extras.spinner import loading_spinner
-
-from langdiversity.measures import AbstractMeasure
-from langdiversity.models import AbstractBaseModel
+from typing import List, Dict, Union

 class PromptSelection:
-    def __init__(
-        self,
-        model: AbstractBaseModel,
-        diversity_measure: AbstractMeasure,
-        num_responses: int = 1,
-        selection: str = "min",
-    ):
+    def __init__(self, data: List[Dict[str, Union[str, List[str], float]]], selection: str = "min"):
         valid_selections = ["min", "max"]
         if selection not in valid_selections:
             raise ValueError(
                 "Invalid selection type. Expected one of %s" % valid_selections
             )
-        self.model = model
-        self.diversity_measure = diversity_measure
-        self.num_responses = num_responses
+        self.data = data
         self.selection = selection

-    def generate(self, prompts: List[str]):
-        if len(prompts) == 0:
-            raise ValueError("Invalid prompts. There should be at least 1 prompt.")
-
-        selected_prompt = ""
-        selected_diversity = float("inf") if self.selection == "min" else float("-inf")
-
-        info = []
-
-        total_prompts = len(prompts)
-        for i, prompt in enumerate(prompts):
-            with loading_spinner(f"Collecting {self.num_responses} responses...", current_step=i+1, total_steps=total_prompts):
-                responses = self.model.generate(prompt, self.num_responses)
-            with loading_spinner("Performing diversity measure calculations...", current_step=i+1, total_steps=total_prompts):
-                diversity = self.diversity_measure.generate(responses)
-
-            if self.selection == "max" and diversity > selected_diversity:
-                selected_diversity = diversity
-                selected_prompt = prompt
-            if self.selection == "min" and diversity < selected_diversity:
-                selected_diversity = diversity
-                selected_prompt = prompt
-
-            info.append(
-                {"responses": responses, "diversity": diversity, "prompt": prompt}
-            )
+    def select(self):
+        if not self.data:
+            raise ValueError("No data to select from.")

-        return selected_prompt, selected_diversity
+        selected_item = min(self.data, key=lambda x: x['diversity']) if self.selection == "min" else max(self.data, key=lambda x: x['diversity'])
+        return selected_item['prompt'], selected_item['diversity']
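The essence of the new `select()` is a `min`/`max` over the collected records keyed on `'diversity'`. A standalone copy of that logic, runnable without the package:

```python
# Standalone copy of the committed selection logic, for illustration.
def select(data, selection="min"):
    if selection not in ("min", "max"):
        raise ValueError("Invalid selection type. Expected one of ['min', 'max']")
    if not data:
        raise ValueError("No data to select from.")
    key = lambda x: x["diversity"]
    item = min(data, key=key) if selection == "min" else max(data, key=key)
    return item["prompt"], item["diversity"]

data = [
    {"prompt": "p1", "responses": ["x", "y"], "diversity": 1.0},
    {"prompt": "p2", "responses": ["x", "x"], "diversity": 0.0},
]
print(select(data, "min"))  # ('p2', 0.0)
print(select(data, "max"))  # ('p1', 1.0)
```

Note that `min`/`max` return the first record with the extreme value, so ties are broken by prompt order in `data`.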

setup.py

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 setup(
     name='langdiversity',
     packages=find_packages(exclude=['tests']),
-    version='1.0.5',
+    version='1.1.0',
     description='A tool to elevate your language models with insightful diversity metrics.',
     long_description=long_description,
     long_description_content_type="text/markdown",
