Commit 6c99695

Version 1.4 - BREAKING changes
1 parent 2741d40 commit 6c99695

10 files changed: +282 -192 lines


config.yaml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+AVAILABLE_MODELS:
+- BAAI/bge-large-en
+- BAAI/bge-base-en
+- BAAI/bge-small-en
+- thenlper/gte-large
+- thenlper/gte-base
+- thenlper/gte-small
+- intfloat/e5-large-v2
+- intfloat/e5-base-v2
+- intfloat/e5-small-v2
+- hkunlp/instructor-xl
+- hkunlp/instructor-large
+- hkunlp/instructor-base
+- sentence-transformers/all-mpnet-base-v2
+- sentence-transformers/all-MiniLM-L12-v2
+- sentence-transformers/all-MiniLM-L6-v2
+COMPUTE_DEVICE: cuda
+DOCUMENT_MAP:
+  .csv: UnstructuredCSVLoader
+  .docx: Docx2txtLoader
+  .eml: UnstructuredEmailLoader
+  .enex: EverNoteLoader
+  .json: JSONLoader
+  .msg: UnstructuredEmailLoader
+  .pdf: PDFMinerLoader
+  .txt: TextLoader
+  .xls: UnstructuredExcelLoader
+  .xlsx: UnstructuredExcelLoader
+EMBEDDING_MODEL_NAME: C:/PATH/Scripts/LM Search Vector Database_v1_working/Embedding_Models/BAAI--bge-base-en
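
Everything below reads this file with plain yaml.safe_load. A minimal sketch of consuming the new config, mirroring the load_config helper added in gui_logic.py further down:

    import yaml

    # Read the shared config that gui.py and gui_logic.py now write to and read from.
    with open("config.yaml", "r") as stream:
        config = yaml.safe_load(stream)

    print(config["AVAILABLE_MODELS"][0])   # e.g. BAAI/bge-large-en
    print(config["COMPUTE_DEVICE"])        # cuda, mps, or cpu
    print(config["EMBEDDING_MODEL_NAME"])  # path of the currently selected embedding model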

document_chunker.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+def split_documents(documents):
+
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=400)
+    texts = text_splitter.split_documents(documents)
+    return texts
+
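
A quick usage sketch — the Document class comes from langchain, as in document_loader.py below; the sample text is illustrative:

    from langchain.docstore.document import Document
    from document_chunker import split_documents

    doc = Document(page_content="some long text " * 200, metadata={"source": "example.txt"})
    chunks = split_documents([doc])
    # Chunks are capped at roughly 1200 characters, with a 400-character overlap
    # between neighbors so context survives chunk boundaries.
    print(len(chunks), len(chunks[0].page_content))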

document_loader.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ProcessPoolExecutor
+
+from langchain.docstore.document import Document
+from langchain.document_loaders import (
+    PDFMinerLoader,
+    Docx2txtLoader,
+    TextLoader,
+    JSONLoader,
+    EverNoteLoader,
+    UnstructuredEmailLoader,
+    UnstructuredCSVLoader,
+    UnstructuredExcelLoader
+)
+
+ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
+SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/Docs_for_DB"
+INGEST_THREADS = os.cpu_count() or 8
+
+DOCUMENT_MAP = {
+    ".pdf": PDFMinerLoader,
+    ".docx": Docx2txtLoader,
+    ".txt": TextLoader,
+    ".json": JSONLoader,
+    ".enex": EverNoteLoader,
+    ".eml": UnstructuredEmailLoader,
+    ".msg": UnstructuredEmailLoader,
+    ".csv": UnstructuredCSVLoader,
+    ".xls": UnstructuredExcelLoader,
+    ".xlsx": UnstructuredExcelLoader,
+}
+
+def load_single_document(file_path: str) -> Document:
+    file_extension = os.path.splitext(file_path)[1]
+    loader_class = DOCUMENT_MAP.get(file_extension)
+    if loader_class:
+        loader = loader_class(file_path)
+    else:
+        raise ValueError("Document type is undefined")
+    return loader.load()[0]
+
+def load_document_batch(filepaths):
+    with ThreadPoolExecutor(len(filepaths)) as exe:
+        futures = [exe.submit(load_single_document, name) for name in filepaths]
+        data_list = [future.result() for future in futures]
+        return (data_list, filepaths)
+
+def load_documents(source_dir: str) -> list[Document]:
+    all_files = os.listdir(source_dir)
+    paths = [os.path.join(source_dir, file_path) for file_path in all_files if os.path.splitext(file_path)[1] in DOCUMENT_MAP.keys()]
+
+    n_workers = min(INGEST_THREADS, max(len(paths), 1))
+    chunksize = round(len(paths) / n_workers)
+    docs = []
+    with ProcessPoolExecutor(n_workers) as executor:
+        futures = [executor.submit(load_document_batch, paths[i : (i + chunksize)]) for i in range(0, len(paths), chunksize)]
+        for future in as_completed(futures):
+            contents, _ = future.result()
+            docs.extend(contents)
+
+    return docs
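
Taken together, the two new modules form the ingestion path: load_documents fans files out to one process per batch, and each process loads its batch on a thread pool. A minimal driver sketch — the __main__ guard matters because ProcessPoolExecutor re-imports the module in each worker, especially on Windows:

    # Hypothetical driver combining the two new modules above.
    from document_loader import load_documents, SOURCE_DIRECTORY
    from document_chunker import split_documents

    if __name__ == "__main__":  # required: worker processes re-import this module
        documents = load_documents(SOURCE_DIRECTORY)
        texts = split_documents(documents)
        print(f"Loaded {len(documents)} documents -> {len(texts)} chunks")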

gui.py

Lines changed: 31 additions & 54 deletions
@@ -1,15 +1,31 @@
 import tkinter as tk
+from gui_table import create_table
+import threading
+from nvml import CudaVramLogic
+import torch
+import yaml
+
+def determine_compute_device():
+    if torch.cuda.is_available():
+        COMPUTE_DEVICE = "cuda"
+    elif torch.backends.mps.is_available():
+        COMPUTE_DEVICE = "mps"
+    else:
+        COMPUTE_DEVICE = "cpu"
+
+    with open("config.yaml", 'r') as stream:
+        config_data = yaml.safe_load(stream)
+    config_data['COMPUTE_DEVICE'] = COMPUTE_DEVICE
+    with open("config.yaml", 'w') as stream:
+        yaml.safe_dump(config_data, stream)
 
 class DocQA_GUI:
     def __init__(self, root):
-        self.root = root  # Store the root window for later access
-        self.file_path = tk.StringVar()
+        self.root = root
 
-        # Use a PanedWindow to manage the left buttons and the right text frames
         main_pane = tk.PanedWindow(root, orient=tk.HORIZONTAL)
         main_pane.pack(fill=tk.BOTH, expand=1)
 
-        # Left Section: Buttons
         left_frame = tk.Frame(main_pane)
 
         self.download_embedding_model_button = tk.Button(left_frame, text="Download Embedding Model", width=26)
@@ -24,16 +40,16 @@ def __init__(self, root):
         self.create_chromadb_button = tk.Button(left_frame, text="Create Vector Database", width=26)
         self.create_chromadb_button.pack(pady=5)
 
-        # Create table below the buttons
-        self.create_table(left_frame)
+        create_table(left_frame)
+
+        self.cuda_info_label = tk.Label(left_frame, text="CUDA & VRAM Info", font=("Segoe UI Historic", 10))
+        self.cuda_info_label.pack(pady=5)
 
         main_pane.add(left_frame)
 
-        # Middle and Bottom Sections: Text Input and Output
         right_frame = tk.Frame(main_pane)
         main_pane.add(right_frame)
 
-        # Middle Section: Text Input and Control
         middle_frame = tk.Frame(right_frame)
         middle_frame.pack(pady=5, fill=tk.BOTH, expand=1)
 
@@ -45,11 +61,9 @@ def __init__(self, root):
         scroll1.pack(side=tk.RIGHT, fill=tk.Y)
         self.text_input.config(yscrollcommand=scroll1.set)
 
-        # Button between Middle and Bottom
         self.submit_query_button = tk.Button(right_frame, text="Submit Question", width=15)
         self.submit_query_button.pack(pady=5, side=tk.TOP)
 
-        # Bottom Section: Text Output and Actions
         bottom_frame = tk.Frame(right_frame)
         bottom_frame.pack(pady=5, fill=tk.BOTH, expand=1)
 
@@ -61,64 +75,27 @@ def __init__(self, root):
         scroll2.pack(side=tk.RIGHT, fill=tk.Y)
         self.read_only_text.config(yscrollcommand=scroll2.set)
 
-        # Center the window and display it
+        self.cuda_logic = CudaVramLogic(self.cuda_info_label, self.root)
+
         self.center_window(root)
 
     def center_window(self, root):
-        root.withdraw()  # Hide the window
+        root.withdraw()
         root.update_idletasks()
         width = root.winfo_width()
         height = root.winfo_height()
         x = (root.winfo_screenwidth() // 2) - (width // 2)
         y = (root.winfo_screenheight() // 2) - (height // 2)
         root.geometry('{}x{}+{}+{}'.format(width, height, x, y))
-        root.deiconify()  # Show the window
-
-    def create_table(self, parent_frame):
-        # Define the models and their corresponding VRAM values
-        models = ["BAAI/bge-large-en", "BAAI/bge-base-en", "BAAI/bge-small-en", "thenlper/gte-large",
-                  "thenlper/gte-base", "thenlper/gte-small", "intfloat/e5-large-v2", "intfloat/e5-base-v2",
-                  "intfloat/e5-small-v2", "hkunlp/instructor-xl", "hkunlp/instructor-large", "hkunlp/instructor-base",
-                  "sentence-transformers/all-mpnet-base-v2", "sentence-transformers/all-MiniLM-L12-v2", "sentence-transformers/all-MiniLM-L6-v2"]
-        vram_values = ["5.3GB", "3.7GB", "2.9GB", "5.3GB", "3.7GB", "3GB", "5.2GB", "3.7GB", "2.9GB",
-                       "18.1GB", "6.8GB", "4.6GB", "2.7GB", "1.6GB", "1.6GB"]  # Placeholder values
-
-        # Table frame
-        table_frame = tk.Frame(parent_frame)
-        table_frame.pack(pady=5, fill=tk.BOTH, expand=1)
-
-        # Header
-        tk.Label(table_frame, text="Embedding Model", borderwidth=1, relief="solid").grid(row=0, column=0, sticky="nsew")
-        tk.Label(table_frame, text="Estimated VRAM", borderwidth=1, relief="solid").grid(row=0, column=1, sticky="nsew")
-
-        # Content
-        for i, (model, vram) in enumerate(zip(models, vram_values), start=1):
-            tk.Label(table_frame, text=model, borderwidth=1, relief="solid").grid(row=i, column=0, sticky="nsew")
-            tk.Label(table_frame, text=vram, borderwidth=1, relief="solid").grid(row=i, column=1, sticky="nsew")
-
-        # Adjusting column weights so they expand equally
-        table_frame.grid_columnconfigure(0, weight=1)
-        table_frame.grid_columnconfigure(1, weight=1)
-
-        # Add Pro Tip and accompanying text
-        pro_tip_label = tk.Label(parent_frame, text="Pro tip:", font=("Segoe UI Historic", 12, "bold"))
-        pro_tip_label.pack(pady=(20, 0), anchor="w", padx=5, side=tk.TOP)
-
-        pro_tip_text = ("DO NOT have LM Studio running when creating the vector database. The VRAM numbers above refer to when creating the database. "
-                        "After it's created, run LM Studio and load your LLM (remember only Llama2-based models work currently when querying the database). "
-                        "To query the database, the embedding model will use about half the VRAM it used when creating it. Use the LARGEST embedding "
-                        "model you can possibly fit into VRAM while the LLM is loaded into LM Studio (remembering the half rule above). The quality of the "
-                        "embedding model is ACTUALLY MORE important than the size of the LLM. Experiment with low-quality LLMs and high-quality embedding models. "
-                        "EXAMPLE: q3_k_3 model + instructor-xl worked just fine together.")
-
-        pro_tip_description = tk.Label(parent_frame, text=pro_tip_text, wraplength=400, justify="left")
-        pro_tip_description.pack(anchor="w", padx=5, side=tk.TOP)
+        root.deiconify()
 
 if __name__ == "__main__":
+    determine_compute_device()
     root = tk.Tk()
     root.title("Welcome to the LM Studio ChromaDB Plugin!")
-    root.geometry("800x700")  # Adjust the size slightly for the paned layout
+    root.geometry("800x800")
    app = DocQA_GUI(root)
     from gui_logic import DocQA_Logic
     logic = DocQA_Logic(app)
+    root.protocol("WM_DELETE_WINDOW", app.cuda_logic.stop_and_exit)
     root.mainloop()
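
nvml.py itself is not shown in this commit view; the class is only referenced here (CudaVramLogic(...) and stop_and_exit). As rough orientation, a minimal sketch of what such a VRAM-polling label could look like, assuming the pynvml package — the class and method names come from the calls above, everything else is illustrative:

    import pynvml

    class CudaVramLogic:
        # Illustrative sketch: polls GPU VRAM via NVML and updates a tkinter label.
        def __init__(self, label, root, interval_ms=1000):
            self.label = label
            self.root = root
            self.interval_ms = interval_ms
            self.running = True
            pynvml.nvmlInit()
            self.handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            self._poll()

        def _poll(self):
            if not self.running:
                return
            mem = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            self.label.config(text=f"VRAM: {mem.used / 1024**3:.1f} / {mem.total / 1024**3:.1f} GB")
            self.root.after(self.interval_ms, self._poll)  # reschedule on the tk event loop

        def stop_and_exit(self):
            # Wired to WM_DELETE_WINDOW above: stop polling, release NVML, close the window.
            self.running = False
            pynvml.nvmlShutdown()
            self.root.destroy()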

gui_logic.py

Lines changed: 27 additions & 28 deletions
@@ -3,20 +3,23 @@
 import os
 import shutil
 from glob import glob
+import yaml
 from gui import DocQA_GUI
 from server_connector import interact_with_chat
 import subprocess
 import server_connector
 
+def load_config():
+    with open("config.yaml", 'r') as stream:
+        return yaml.safe_load(stream)
+
 class DownloadModelDialog(simpledialog.Dialog):
     def body(self, master):
         self.model_var = tk.StringVar(value="none_selected")
-        self.models = [
-            "BAAI/bge-large-en", "BAAI/bge-base-en", "BAAI/bge-small-en", "thenlper/gte-large",
-            "thenlper/gte-base", "thenlper/gte-small", "intfloat/e5-large-v2", "intfloat/e5-base-v2",
-            "intfloat/e5-small-v2", "hkunlp/instructor-xl", "hkunlp/instructor-large", "hkunlp/instructor-base",
-            "sentence-transformers/all-mpnet-base-v2", "sentence-transformers/all-MiniLM-L12-v2", "sentence-transformers/all-MiniLM-L6-v2"
-        ]
+
+        config_data = load_config()
+        self.models = config_data["AVAILABLE_MODELS"]
+
         downloaded_models = [f for f in os.listdir('Embedding_Models') if os.path.isdir(os.path.join('Embedding_Models', f))]
 
         for model in self.models:
@@ -32,46 +35,44 @@ def buttons(self):
 class DocQA_Logic:
     def __init__(self, gui: DocQA_GUI):
         self.gui = gui
-        self.embed_model_name = ""
-
-        # Connecting the GUI buttons to their logic
+
+        config_data = load_config()
+        self.embed_model_name = config_data.get("EMBEDDING_MODEL_NAME", "")
+
         self.gui.download_embedding_model_button.config(command=self.download_embedding_model)
         self.gui.select_embedding_model_button.config(command=self.select_embedding_model_directory)
         self.gui.choose_documents_button.config(command=self.choose_documents)
         self.gui.create_chromadb_button.config(command=self.create_chromadb)
         self.gui.submit_query_button.config(command=self.submit_query)
 
     def download_embedding_model(self):
-        # Creating the "Embedding_Models" folder if it doesn't exist
         if not os.path.exists('Embedding_Models'):
             os.makedirs('Embedding_Models')
 
-        # Opening the dialog window
         dialog = DownloadModelDialog(self.gui.root)
         selected_model = dialog.model_var.get()
 
         if selected_model:
-            # Construct the URL for the Hugging Face model repository
            model_url = f"https://huggingface.co/{selected_model}"
 
-            # Define the directory to download the model to
             target_directory = os.path.join("Embedding_Models", selected_model.replace("/", "--"))
 
-            # Clone the repository to the directory
             subprocess.run(["git", "clone", model_url, target_directory])
 
     def select_embedding_model_directory(self):
         initial_dir = 'Embedding_Models' if os.path.exists('Embedding_Models') else os.path.expanduser("~")
         chosen_directory = filedialog.askdirectory(initialdir=initial_dir, title="Select Embedding Model Directory")
 
-        # Choose the model directory to use
         if chosen_directory:
             self.embedding_model_directory = chosen_directory
+            self.embed_model_name = chosen_directory
+
+            # Update the config.yaml file with the chosen model directory
+            config_data = load_config()
+            config_data["EMBEDDING_MODEL_NAME"] = chosen_directory
+            with open("config.yaml", 'w') as file:
+                yaml.dump(config_data, file)
 
-            # Update the global variable in server_connector.py
-            server_connector.EMBEDDING_MODEL_NAME = chosen_directory
-
-            # Optionally, you can print or display a confirmation to the user
             print(f"Selected directory: {chosen_directory}")
 
     def choose_documents(self):
@@ -90,19 +91,12 @@ def create_chromadb(self):
         current_dir = os.path.dirname(os.path.realpath(__file__))
         vector_db_folder = os.path.join(current_dir, "Vector_DB")
 
-        # Create the "Vector_DB" folder if it doesn't exist
         if not os.path.exists(vector_db_folder):
             os.mkdir(vector_db_folder)
 
         response = messagebox.askokcancel(
-            "Create New Vector Database?",
-            "Proceeding will:\n\n"
-            "(1) Delete the current database\n"
-            "(2) Create a new ChromaDB vector database.\n\n"
-            "If GPU acceleration is properly set up, you will see CUDA being utilized when the database is created. "
-            "Check CUDA usage by going to Task Manager, select your GPU, and choosing "
-            "the 'CUDA' graph from one of the pull-down menus.\n\n"
-            "CUDA usage stops once the vector database is created and then you can ask questions of your docs!"
+            "Create Vector Database?",
+            "This will overwrite any current databases!"
         )
 
         if response:
@@ -132,3 +126,8 @@ def submit_query(self):
         self.gui.read_only_text.insert(tk.END, answer)
         self.gui.read_only_text.config(state=tk.DISABLED)
 
+if __name__ == "__main__":
+    root = tk.Tk()
+    app = DocQA_GUI(root)
+    logic = DocQA_Logic(app)
+    root.mainloop()
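
The breaking change running through this file: model selection no longer mutates a server_connector global but is persisted to config.yaml, so every module (and the next session) reads the same state. A minimal sketch of that read-modify-write pattern — update_config is an illustrative helper, not part of the diff:

    import yaml

    def load_config():
        with open("config.yaml", "r") as stream:
            return yaml.safe_load(stream)

    def update_config(key, value):
        # Illustrative read-modify-write against the shared config file.
        config_data = load_config()
        config_data[key] = value
        with open("config.yaml", "w") as file:
            yaml.dump(config_data, file)

    update_config("EMBEDDING_MODEL_NAME", "Embedding_Models/BAAI--bge-base-en")
    print(load_config()["EMBEDDING_MODEL_NAME"])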
