Commit 6c99695

Version 1.4 - BREAKING changes
1 parent 2741d40 commit 6c99695

10 files changed: +282 -192 lines


config.yaml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+AVAILABLE_MODELS:
+- BAAI/bge-large-en
+- BAAI/bge-base-en
+- BAAI/bge-small-en
+- thenlper/gte-large
+- thenlper/gte-base
+- thenlper/gte-small
+- intfloat/e5-large-v2
+- intfloat/e5-base-v2
+- intfloat/e5-small-v2
+- hkunlp/instructor-xl
+- hkunlp/instructor-large
+- hkunlp/instructor-base
+- sentence-transformers/all-mpnet-base-v2
+- sentence-transformers/all-MiniLM-L12-v2
+- sentence-transformers/all-MiniLM-L6-v2
+COMPUTE_DEVICE: cuda
+DOCUMENT_MAP:
+  .csv: UnstructuredCSVLoader
+  .docx: Docx2txtLoader
+  .eml: UnstructuredEmailLoader
+  .enex: EverNoteLoader
+  .json: JSONLoader
+  .msg: UnstructuredEmailLoader
+  .pdf: PDFMinerLoader
+  .txt: TextLoader
+  .xls: UnstructuredExcelLoader
+  .xlsx: UnstructuredExcelLoader
+EMBEDDING_MODEL_NAME: C:/PATH/Scripts/LM Search Vector Database_v1_working/Embedding_Models/BAAI--bge-base-en
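
Everything below reads this file with plain yaml.safe_load. A minimal sketch of consuming the new config, mirroring the load_config helper added in gui_logic.py further down:

    import yaml

    # Read the shared config that gui.py and gui_logic.py now write to and read from.
    with open("config.yaml", "r") as stream:
        config = yaml.safe_load(stream)

    print(config["AVAILABLE_MODELS"][0])   # e.g. BAAI/bge-large-en
    print(config["COMPUTE_DEVICE"])        # cuda, mps, or cpu
    print(config["EMBEDDING_MODEL_NAME"])  # path of the currently selected embedding model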

document_chunker.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+def split_documents(documents):
+
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=400)
+    texts = text_splitter.split_documents(documents)
+    return texts
+
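
A quick usage sketch — the Document class comes from langchain, as in document_loader.py below; the sample text is illustrative:

    from langchain.docstore.document import Document
    from document_chunker import split_documents

    doc = Document(page_content="some long text " * 200, metadata={"source": "example.txt"})
    chunks = split_documents([doc])
    # Chunks are capped at roughly 1200 characters, with a 400-character overlap
    # between neighbors so context survives chunk boundaries.
    print(len(chunks), len(chunks[0].page_content))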

document_loader.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ProcessPoolExecutor
+
+from langchain.docstore.document import Document
+from langchain.document_loaders import (
+    PDFMinerLoader,
+    Docx2txtLoader,
+    TextLoader,
+    JSONLoader,
+    EverNoteLoader,
+    UnstructuredEmailLoader,
+    UnstructuredCSVLoader,
+    UnstructuredExcelLoader
+)
+
+ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
+SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/Docs_for_DB"
+INGEST_THREADS = os.cpu_count() or 8
+
+DOCUMENT_MAP = {
+    ".pdf": PDFMinerLoader,
+    ".docx": Docx2txtLoader,
+    ".txt": TextLoader,
+    ".json": JSONLoader,
+    ".enex": EverNoteLoader,
+    ".eml": UnstructuredEmailLoader,
+    ".msg": UnstructuredEmailLoader,
+    ".csv": UnstructuredCSVLoader,
+    ".xls": UnstructuredExcelLoader,
+    ".xlsx": UnstructuredExcelLoader,
+}
+
+def load_single_document(file_path: str) -> Document:
+    file_extension = os.path.splitext(file_path)[1]
+    loader_class = DOCUMENT_MAP.get(file_extension)
+    if loader_class:
+        loader = loader_class(file_path)
+    else:
+        raise ValueError("Document type is undefined")
+    return loader.load()[0]
+
+def load_document_batch(filepaths):
+    with ThreadPoolExecutor(len(filepaths)) as exe:
+        futures = [exe.submit(load_single_document, name) for name in filepaths]
+        data_list = [future.result() for future in futures]
+        return (data_list, filepaths)
+
+def load_documents(source_dir: str) -> list[Document]:
+    all_files = os.listdir(source_dir)
+    paths = [os.path.join(source_dir, file_path) for file_path in all_files if os.path.splitext(file_path)[1] in DOCUMENT_MAP.keys()]
+
+    n_workers = min(INGEST_THREADS, max(len(paths), 1))
+    chunksize = round(len(paths) / n_workers)
+    docs = []
+    with ProcessPoolExecutor(n_workers) as executor:
+        futures = [executor.submit(load_document_batch, paths[i : (i + chunksize)]) for i in range(0, len(paths), chunksize)]
+        for future in as_completed(futures):
+            contents, _ = future.result()
+            docs.extend(contents)
+
+    return docs
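
Taken together, the two new modules form the ingestion path: load_documents fans files out to one process per batch, and each process loads its batch on a thread pool. A minimal driver sketch — the __main__ guard matters because ProcessPoolExecutor re-imports the module in each worker, especially on Windows:

    # Hypothetical driver combining the two new modules above.
    from document_loader import load_documents, SOURCE_DIRECTORY
    from document_chunker import split_documents

    if __name__ == "__main__":  # required: worker processes re-import this module
        documents = load_documents(SOURCE_DIRECTORY)
        texts = split_documents(documents)
        print(f"Loaded {len(documents)} documents -> {len(texts)} chunks")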

gui.py

Lines changed: 31 additions & 54 deletions
@@ -1,15 +1,31 @@
 import tkinter as tk
+from gui_table import create_table
+import threading
+from nvml import CudaVramLogic
+import torch
+import yaml
+
+def determine_compute_device():
+    if torch.cuda.is_available():
+        COMPUTE_DEVICE = "cuda"
+    elif torch.backends.mps.is_available():
+        COMPUTE_DEVICE = "mps"
+    else:
+        COMPUTE_DEVICE = "cpu"
+
+    with open("config.yaml", 'r') as stream:
+        config_data = yaml.safe_load(stream)
+    config_data['COMPUTE_DEVICE'] = COMPUTE_DEVICE
+    with open("config.yaml", 'w') as stream:
+        yaml.safe_dump(config_data, stream)
 
 class DocQA_GUI:
     def __init__(self, root):
-        self.root = root  # Store the root window for later access
-        self.file_path = tk.StringVar()
+        self.root = root
 
-        # Use a PanedWindow to manage the left buttons and the right text frames
         main_pane = tk.PanedWindow(root, orient=tk.HORIZONTAL)
         main_pane.pack(fill=tk.BOTH, expand=1)
 
-        # Left Section: Buttons
         left_frame = tk.Frame(main_pane)
 
         self.download_embedding_model_button = tk.Button(left_frame, text="Download Embedding Model", width=26)
@@ -24,16 +40,16 @@ def __init__(self, root):
         self.create_chromadb_button = tk.Button(left_frame, text="Create Vector Database", width=26)
         self.create_chromadb_button.pack(pady=5)
 
-        # Create table below the buttons
-        self.create_table(left_frame)
+        create_table(left_frame)
+
+        self.cuda_info_label = tk.Label(left_frame, text="CUDA & VRAM Info", font=("Segoe UI Historic", 10))
+        self.cuda_info_label.pack(pady=5)
 
         main_pane.add(left_frame)
 
-        # Middle and Bottom Sections: Text Input and Output
         right_frame = tk.Frame(main_pane)
         main_pane.add(right_frame)
 
-        # Middle Section: Text Input and Control
         middle_frame = tk.Frame(right_frame)
         middle_frame.pack(pady=5, fill=tk.BOTH, expand=1)
 
@@ -45,11 +61,9 @@ def __init__(self, root):
         scroll1.pack(side=tk.RIGHT, fill=tk.Y)
         self.text_input.config(yscrollcommand=scroll1.set)
 
-        # Button between Middle and Bottom
         self.submit_query_button = tk.Button(right_frame, text="Submit Question", width=15)
         self.submit_query_button.pack(pady=5, side=tk.TOP)
 
-        # Bottom Section: Text Output and Actions
         bottom_frame = tk.Frame(right_frame)
         bottom_frame.pack(pady=5, fill=tk.BOTH, expand=1)
 
@@ -61,64 +75,27 @@ def __init__(self, root):
         scroll2.pack(side=tk.RIGHT, fill=tk.Y)
         self.read_only_text.config(yscrollcommand=scroll2.set)
 
-        # Center the window and display it
+        self.cuda_logic = CudaVramLogic(self.cuda_info_label, self.root)
+
         self.center_window(root)
 
     def center_window(self, root):
-        root.withdraw()  # Hide the window
+        root.withdraw()
         root.update_idletasks()
         width = root.winfo_width()
         height = root.winfo_height()
         x = (root.winfo_screenwidth() // 2) - (width // 2)
         y = (root.winfo_screenheight() // 2) - (height // 2)
         root.geometry('{}x{}+{}+{}'.format(width, height, x, y))
-        root.deiconify()  # Show the window
-
-    def create_table(self, parent_frame):
-        # Define the models and their corresponding VRAM values
-        models = ["BAAI/bge-large-en", "BAAI/bge-base-en", "BAAI/bge-small-en", "thenlper/gte-large",
-                  "thenlper/gte-base", "thenlper/gte-small", "intfloat/e5-large-v2", "intfloat/e5-base-v2",
-                  "intfloat/e5-small-v2", "hkunlp/instructor-xl", "hkunlp/instructor-large", "hkunlp/instructor-base",
-                  "sentence-transformers/all-mpnet-base-v2", "sentence-transformers/all-MiniLM-L12-v2", "sentence-transformers/all-MiniLM-L6-v2"]
-        vram_values = ["5.3GB", "3.7GB", "2.9GB", "5.3GB", "3.7GB", "3GB", "5.2GB", "3.7GB", "2.9GB",
-                       "18.1GB", "6.8GB", "4.6GB", "2.7GB", "1.6GB", "1.6GB"]  # Placeholder values
-
-        # Table frame
-        table_frame = tk.Frame(parent_frame)
-        table_frame.pack(pady=5, fill=tk.BOTH, expand=1)
-
-        # Header
-        tk.Label(table_frame, text="Embedding Model", borderwidth=1, relief="solid").grid(row=0, column=0, sticky="nsew")
-        tk.Label(table_frame, text="Estimated VRAM", borderwidth=1, relief="solid").grid(row=0, column=1, sticky="nsew")
-
-        # Content
-        for i, (model, vram) in enumerate(zip(models, vram_values), start=1):
-            tk.Label(table_frame, text=model, borderwidth=1, relief="solid").grid(row=i, column=0, sticky="nsew")
-            tk.Label(table_frame, text=vram, borderwidth=1, relief="solid").grid(row=i, column=1, sticky="nsew")
-
-        # Adjusting column weights so they expand equally
-        table_frame.grid_columnconfigure(0, weight=1)
-        table_frame.grid_columnconfigure(1, weight=1)
-
-        # Add Pro Tip and accompanying text
-        pro_tip_label = tk.Label(parent_frame, text="Pro tip:", font=("Segoe UI Historic", 12, "bold"))
-        pro_tip_label.pack(pady=(20, 0), anchor="w", padx=5, side=tk.TOP)
-
-        pro_tip_text = ("DO NOT have LM Studio running when creating the vector database. The VRAM numbers above refer to when creating the database. "
-                        "After it's created, run LM Studio and load your LLM (remember only Llama2-based models work currently when querying the database). "
-                        "To query the database, the embedding model will use about half the VRAM it used when creating it. Use the LARGEST embedding "
-                        "model you can possibly fit into VRAM while the LLM is loaded into LM Studio (remembering the half rule above). The quality of the "
-                        "embedding model is ACTUALLY MORE important than the size of the LLM. Experiment with low-quality LLMs and high-quality embedding models. "
-                        "EXAMPLE: q3_k_3 model + instructor-xl worked just fine together.")
-
-        pro_tip_description = tk.Label(parent_frame, text=pro_tip_text, wraplength=400, justify="left")
-        pro_tip_description.pack(anchor="w", padx=5, side=tk.TOP)
+        root.deiconify()
 
 if __name__ == "__main__":
+    determine_compute_device()
     root = tk.Tk()
     root.title("Welcome to the LM Studio ChromaDB Plugin!")
-    root.geometry("800x700")  # Adjust the size slightly for the paned layout
+    root.geometry("800x800")
    app = DocQA_GUI(root)
     from gui_logic import DocQA_Logic
     logic = DocQA_Logic(app)
+    root.protocol("WM_DELETE_WINDOW", app.cuda_logic.stop_and_exit)
     root.mainloop()
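
nvml.py itself is not shown in this commit view; the class is only referenced here (CudaVramLogic(...) and stop_and_exit). As rough orientation, a minimal sketch of what such a VRAM-polling label could look like, assuming the pynvml package — the class and method names come from the calls above, everything else is illustrative:

    import pynvml

    class CudaVramLogic:
        # Illustrative sketch: polls GPU VRAM via NVML and updates a tkinter label.
        def __init__(self, label, root, interval_ms=1000):
            self.label = label
            self.root = root
            self.interval_ms = interval_ms
            self.running = True
            pynvml.nvmlInit()
            self.handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            self._poll()

        def _poll(self):
            if not self.running:
                return
            mem = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            self.label.config(text=f"VRAM: {mem.used / 1024**3:.1f} / {mem.total / 1024**3:.1f} GB")
            self.root.after(self.interval_ms, self._poll)  # reschedule on the tk event loop

        def stop_and_exit(self):
            # Wired to WM_DELETE_WINDOW above: stop polling, release NVML, close the window.
            self.running = False
            pynvml.nvmlShutdown()
            self.root.destroy()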

gui_logic.py

Lines changed: 27 additions & 28 deletions
@@ -3,20 +3,23 @@
 import os
 import shutil
 from glob import glob
+import yaml
 from gui import DocQA_GUI
 from server_connector import interact_with_chat
 import subprocess
 import server_connector
 
+def load_config():
+    with open("config.yaml", 'r') as stream:
+        return yaml.safe_load(stream)
+
 class DownloadModelDialog(simpledialog.Dialog):
     def body(self, master):
         self.model_var = tk.StringVar(value="none_selected")
-        self.models = [
-            "BAAI/bge-large-en", "BAAI/bge-base-en", "BAAI/bge-small-en", "thenlper/gte-large",
-            "thenlper/gte-base", "thenlper/gte-small", "intfloat/e5-large-v2", "intfloat/e5-base-v2",
-            "intfloat/e5-small-v2", "hkunlp/instructor-xl", "hkunlp/instructor-large", "hkunlp/instructor-base",
-            "sentence-transformers/all-mpnet-base-v2", "sentence-transformers/all-MiniLM-L12-v2", "sentence-transformers/all-MiniLM-L6-v2"
-        ]
+
+        config_data = load_config()
+        self.models = config_data["AVAILABLE_MODELS"]
+
         downloaded_models = [f for f in os.listdir('Embedding_Models') if os.path.isdir(os.path.join('Embedding_Models', f))]
 
         for model in self.models:
@@ -32,46 +35,44 @@ def buttons(self):
 class DocQA_Logic:
     def __init__(self, gui: DocQA_GUI):
         self.gui = gui
-        self.embed_model_name = ""
-
-        # Connecting the GUI buttons to their logic
+
+        config_data = load_config()
+        self.embed_model_name = config_data.get("EMBEDDING_MODEL_NAME", "")
+
         self.gui.download_embedding_model_button.config(command=self.download_embedding_model)
         self.gui.select_embedding_model_button.config(command=self.select_embedding_model_directory)
         self.gui.choose_documents_button.config(command=self.choose_documents)
         self.gui.create_chromadb_button.config(command=self.create_chromadb)
         self.gui.submit_query_button.config(command=self.submit_query)
 
     def download_embedding_model(self):
-        # Creating the "Embedding_Models" folder if it doesn't exist
         if not os.path.exists('Embedding_Models'):
             os.makedirs('Embedding_Models')
 
-        # Opening the dialog window
         dialog = DownloadModelDialog(self.gui.root)
         selected_model = dialog.model_var.get()
 
         if selected_model:
-            # Construct the URL for the Hugging Face model repository
            model_url = f"https://huggingface.co/{selected_model}"
 
-            # Define the directory to download the model to
             target_directory = os.path.join("Embedding_Models", selected_model.replace("/", "--"))
 
-            # Clone the repository to the directory
             subprocess.run(["git", "clone", model_url, target_directory])
 
     def select_embedding_model_directory(self):
         initial_dir = 'Embedding_Models' if os.path.exists('Embedding_Models') else os.path.expanduser("~")
         chosen_directory = filedialog.askdirectory(initialdir=initial_dir, title="Select Embedding Model Directory")
 
-        # Choose the model directory to use
         if chosen_directory:
             self.embedding_model_directory = chosen_directory
+            self.embed_model_name = chosen_directory
+
+            # Update the config.yaml file with the chosen model directory
+            config_data = load_config()
+            config_data["EMBEDDING_MODEL_NAME"] = chosen_directory
+            with open("config.yaml", 'w') as file:
+                yaml.dump(config_data, file)
 
-            # Update the global variable in server_connector.py
-            server_connector.EMBEDDING_MODEL_NAME = chosen_directory
-
-            # Optionally, you can print or display a confirmation to the user
             print(f"Selected directory: {chosen_directory}")
 
     def choose_documents(self):
@@ -90,19 +91,12 @@ def create_chromadb(self):
         current_dir = os.path.dirname(os.path.realpath(__file__))
         vector_db_folder = os.path.join(current_dir, "Vector_DB")
 
-        # Create the "Vector_DB" folder if it doesn't exist
         if not os.path.exists(vector_db_folder):
             os.mkdir(vector_db_folder)
 
         response = messagebox.askokcancel(
-            "Create New Vector Database?",
-            "Proceeding will:\n\n"
-            "(1) Delete the current database\n"
-            "(2) Create a new ChromaDB vector database.\n\n"
-            "If GPU acceleration is properly set up, you will see CUDA being utilized when the database is created. "
-            "Check CUDA usage by going to Task Manager, select your GPU, and choosing "
-            "the 'CUDA' graph from one of the pull-down menus.\n\n"
-            "CUDA usage stops once the vector database is created and then you can ask questions of your docs!"
+            "Create Vector Database?",
+            "This will overwrite any current databases!"
         )
 
         if response:
@@ -132,3 +126,8 @@ def submit_query(self):
         self.gui.read_only_text.insert(tk.END, answer)
         self.gui.read_only_text.config(state=tk.DISABLED)
 
+if __name__ == "__main__":
+    root = tk.Tk()
+    app = DocQA_GUI(root)
+    logic = DocQA_Logic(app)
+    root.mainloop()
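
The breaking change running through this file: model selection no longer mutates a server_connector global but is persisted to config.yaml, so every module (and the next session) reads the same state. A minimal sketch of that read-modify-write pattern — update_config is an illustrative helper, not part of the diff:

    import yaml

    def load_config():
        with open("config.yaml", "r") as stream:
            return yaml.safe_load(stream)

    def update_config(key, value):
        # Illustrative read-modify-write against the shared config file.
        config_data = load_config()
        config_data[key] = value
        with open("config.yaml", "w") as file:
            yaml.dump(config_data, file)

    update_config("EMBEDDING_MODEL_NAME", "Embedding_Models/BAAI--bge-base-en")
    print(load_config()["EMBEDDING_MODEL_NAME"])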
