Skip to content

Commit 4f9e36e

Browse files
authored
bug fixes
1 parent ac4fecc commit 4f9e36e

File tree

5 files changed

+145
-189
lines changed

5 files changed

+145
-189
lines changed
Lines changed: 108 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,99 +1,147 @@
11
from pathlib import Path
22
import yaml
3-
from PySide6.QtWidgets import (QFileDialog, QDialog, QVBoxLayout, QTextEdit,
4-
QPushButton, QHBoxLayout, QMessageBox)
5-
from create_symlinks import create_symlinks_parallel
6-
7-
ALLOWED_EXTENSIONS = {'.pdf', '.docx', '.epub', '.txt', '.enex', '.eml', '.msg', '.csv', '.xls', '.xlsx',
8-
'.rtf', '.odt', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff', '.html',
9-
'.htm', '.md', '.doc'}
3+
from PySide6.QtCore import QThread, Signal, Qt, QElapsedTimer
4+
from PySide6.QtWidgets import (QFileDialog, QDialog, QVBoxLayout, QTextEdit, QPushButton, QHBoxLayout, QMessageBox, QProgressDialog, QApplication, QFileSystemModel)
5+
from multiprocessing import Pool, cpu_count
6+
from create_symlinks import _create_single_symlink
7+
ALLOWED_EXTENSIONS = {'.pdf', '.docx', '.epub', '.txt', '.enex', '.eml', '.msg', '.csv', '.xls', '.xlsx', '.rtf', '.odt', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff', '.html', '.htm', '.md', '.doc'}
108
DOCS_FOLDER = "Docs_for_DB"
119
CONFIG_FILE = "config.yaml"
12-
10+
class SymlinkWorker(QThread):
    """Background thread that symlinks documents into ``target_dir``.

    ``source`` is either a directory (``str`` or ``Path``) or an iterable of
    file paths. For a directory, only its immediate files whose suffix is in
    ``ALLOWED_EXTENSIONS`` are linked; an iterable is linked as given.

    Signals:
        progress(int): percentage (0-100) of files processed so far.
        finished(int, list): number of links created and the collected errors.
            Always emitted exactly once, even if the run is cancelled or fails.
    """

    progress = Signal(int)
    # NOTE(review): this redefines QThread's built-in ``finished`` signal with a
    # different signature. Kept because callers connect to it under this name.
    finished = Signal(int, list)

    def __init__(self, source, target_dir, parent=None):
        super().__init__(parent)
        self.source = source
        self.target_dir = Path(target_dir)

    def _collect_files(self):
        """Return the list of file paths that should be linked for ``self.source``."""
        if isinstance(self.source, (str, Path)):
            return [
                str(p)
                for p in Path(self.source).iterdir()
                if p.is_file() and p.suffix.lower() in ALLOWED_EXTENSIONS
            ]
        return list(self.source)

    def run(self):
        """Create the symlinks, reporting throttled progress and a final summary."""
        made = 0
        errors = []
        try:
            files = self._collect_files()
            total = len(files)
            target = str(self.target_dir)
            # Only consider emitting roughly once per percent of the work.
            step = max(1, total // 100)
            last_pct = -1
            timer = QElapsedTimer()
            timer.start()

            def record(done, ok, err):
                """Tally one result and emit progress for ``done`` processed files."""
                nonlocal made, last_pct
                if ok:
                    made += 1
                if err:
                    errors.append(err)
                if done % step and done != total:
                    return
                pct = done * 100 // total
                # Throttle intermediate updates to one per 500 ms, but never
                # drop the final update so the dialog can reach 100%.
                if pct != last_pct and (done == total or timer.elapsed() > 500):
                    self.progress.emit(pct)
                    last_pct = pct
                    timer.restart()

            if total > 1000:
                # Large batches are fanned out to worker processes.
                processes = min((total // 10000) + 1, cpu_count())
                file_args = [(f, target) for f in files]
                with Pool(processes=processes) as pool:
                    results = pool.imap_unordered(_create_single_symlink, file_args)
                    for done, (ok, err) in enumerate(results, 1):
                        record(done, ok, err)
                        if self.isInterruptionRequested():
                            # Leaving the ``with`` block terminates the pool.
                            break
            else:
                # Progress is driven by files processed (not links created), so
                # failures do not stall the progress bar.
                for done, f in enumerate(files, 1):
                    if self.isInterruptionRequested():
                        break
                    ok, err = _create_single_symlink((f, target))
                    record(done, ok, err)
        except Exception as exc:
            # Thread boundary: report the failure instead of letting the
            # exception escape ``run`` and leave the UI waiting forever.
            errors.append(f"Symlink worker failed: {exc}")
        finally:
            self.finished.emit(made, errors)
1361
def choose_documents_directory():
    """Let the user pick a directory or individual files and symlink them into ``DOCS_FOLDER``.

    The target folder is created next to this module if it does not exist.
    The user first chooses between selecting a directory, selecting files, or
    cancelling. Link creation runs on a ``SymlinkWorker`` thread while a
    progress dialog is shown; a summary message box is displayed when it ends.

    For individual files, those whose extension is not in
    ``ALLOWED_EXTENSIONS`` are listed in a confirmation dialog, and the whole
    operation is abandoned if the user rejects it.
    """
    current_dir = Path(__file__).parent.resolve()
    target_dir = current_dir / DOCS_FOLDER
    target_dir.mkdir(parents=True, exist_ok=True)

    msg_box = QMessageBox()
    msg_box.setWindowTitle("Selection Type")
    msg_box.setText("Would you like to select a directory or individual files?")
    dir_button = msg_box.addButton("Select Directory", QMessageBox.ActionRole)
    files_button = msg_box.addButton("Select Files", QMessageBox.ActionRole)
    cancel_button = msg_box.addButton("Cancel", QMessageBox.RejectRole)
    msg_box.exec()
    clicked_button = msg_box.clickedButton()
    if clicked_button == cancel_button:
        return

    file_dialog = QFileDialog()

    def start_worker(source):
        """Run a ``SymlinkWorker`` for ``source`` behind a cancellable progress dialog."""
        # Range (0, 0) shows a busy indicator until the first progress value arrives.
        progress = QProgressDialog("Creating symlinks...", "Cancel", 0, 0)
        progress.setWindowModality(Qt.WindowModal)
        progress.setMinimumDuration(0)
        worker = SymlinkWorker(source, target_dir)

        main_window = _get_main_window()
        if main_window and hasattr(main_window, 'databases_tab'):
            db_tab = main_window.databases_tab
            if hasattr(db_tab, 'docs_model') and db_tab.docs_model:
                # Stop the model from watching the folder while links are being created.
                if hasattr(QFileSystemModel, 'DontWatchForChanges'):
                    db_tab.docs_model.setOption(QFileSystemModel.DontWatchForChanges, True)
            if hasattr(db_tab, 'docs_refresh'):
                db_tab.docs_refresh.start()

        progress.canceled.connect(worker.requestInterruption)

        def update_progress(pct):
            # Switch from the busy indicator to a real 0-100 range on first update.
            if progress.maximum() == 0:
                progress.setRange(0, 100)
            progress.setValue(pct)

        def _done(count, errs):
            if main_window and hasattr(main_window, 'databases_tab'):
                db_tab = main_window.databases_tab
                if hasattr(db_tab, 'docs_refresh'):
                    db_tab.docs_refresh.stop()
                if hasattr(db_tab, 'docs_model') and db_tab.docs_model:
                    if hasattr(db_tab.docs_model, 'refresh'):
                        db_tab.docs_model.refresh()
                    elif hasattr(db_tab.docs_model, 'reindex'):
                        db_tab.docs_model.reindex()
                    if hasattr(QFileSystemModel, 'DontWatchForChanges'):
                        db_tab.docs_model.setOption(QFileSystemModel.DontWatchForChanges, False)
            progress.reset()
            msg = f"Created {count} symlinks"
            if errs:
                msg += f" – {len(errs)} errors (see console)"
                print(*errs, sep="\n")
            QMessageBox.information(None, "Symlinks", msg)

        worker.finished.connect(_done)
        # Connected exactly once; a second connection would run the slot twice per emit.
        worker.progress.connect(update_progress)
        worker.start()
        # Keep a reference so the running thread is not garbage-collected.
        choose_documents_directory._symlink_thread = worker

    if clicked_button == dir_button:
        file_dialog.setFileMode(QFileDialog.Directory)
        file_dialog.setOption(QFileDialog.ShowDirsOnly, True)
        selected_dir = file_dialog.getExistingDirectory(None, "Choose Directory for Database", str(current_dir))
        if selected_dir:
            # The worker filters the directory's files by extension itself.
            start_worker(Path(selected_dir))
    else:
        file_dialog.setFileMode(QFileDialog.ExistingFiles)
        file_paths = file_dialog.getOpenFileNames(None, "Choose Documents and Images for Database", str(current_dir))[0]
        if file_paths:
            compatible_files = []
            incompatible_files = []
            for file_path in file_paths:
                path = Path(file_path)
                if path.suffix.lower() in ALLOWED_EXTENSIONS:
                    compatible_files.append(str(path))
                else:
                    incompatible_files.append(path.name)
            if incompatible_files and not show_incompatible_files_dialog(incompatible_files):
                return
            if compatible_files:
                start_worker(compatible_files)
86139
def show_incompatible_files_dialog(incompatible_files):
87-
dialog_text = (
88-
"The following files cannot be added here due to their file extension:\n\n" +
89-
"\n".join(incompatible_files) +
90-
"\n\nHowever, if any of them are audio files you can still add them directly in the Tools Tab."
91-
"\n\nClick 'Ok' to add the compatible documents only (remembering to add audio files separately) or 'Cancel' to back out completely."
92-
)
140+
dialog_text = ("The following files cannot be added here due to their file extension:\n\n" + "\n".join(incompatible_files) + "\n\nHowever, if any of them are audio files you can still add them directly in the Tools Tab."
141+
"\n\nClick 'Ok' to add the compatible documents only (remembering to add audio files separately) or 'Cancel' to back out completely.")
93142
incompatible_dialog = QDialog()
94143
incompatible_dialog.resize(800, 600)
95144
incompatible_dialog.setWindowTitle("Incompatible Files Detected")
96-
97145
layout = QVBoxLayout()
98146
text_edit = QTextEdit()
99147
text_edit.setReadOnly(True)
@@ -109,18 +157,19 @@ def show_incompatible_files_dialog(incompatible_files):
109157
ok_button.clicked.connect(incompatible_dialog.accept)
110158
cancel_button.clicked.connect(incompatible_dialog.reject)
111159
return incompatible_dialog.exec() == QDialog.Accepted
112-
113160
def load_config():
    """Read ``CONFIG_FILE`` as UTF-8 and return its parsed YAML content."""
    with open(CONFIG_FILE, mode='r', encoding='utf-8') as config_stream:
        parsed_config = yaml.safe_load(config_stream)
    return parsed_config
116-
117163
def select_embedding_model_directory():
    """Ask the user for an embedding model directory and store it in ``CONFIG_FILE``.

    The dialog opens in ``Models`` when that folder exists, otherwise in the
    user's home directory. If the user cancels, nothing is written. Otherwise
    the chosen path is saved under the ``EMBEDDING_MODEL_NAME`` key, keeping
    any other settings already present in the config file.
    """
    initial_dir = Path('Models') if Path('Models').exists() else Path.home()
    chosen_directory = QFileDialog.getExistingDirectory(None, "Select Embedding Model Directory", str(initial_dir))
    if not chosen_directory:
        return

    config_file_path = Path(CONFIG_FILE)
    config_data = {}
    if config_file_path.exists():
        # safe_load returns None for an empty file; fall back to an empty
        # mapping so the assignment below cannot fail.
        config_data = yaml.safe_load(config_file_path.read_text(encoding='utf-8')) or {}
    config_data["EMBEDDING_MODEL_NAME"] = chosen_directory
    config_file_path.write_text(yaml.dump(config_data), encoding='utf-8')
126-
print(f"Selected directory: {chosen_directory}")
171+
def _get_main_window():
    """Return the first top-level widget that has a ``databases_tab`` attribute, else ``None``."""
    candidates = (
        top_level
        for top_level in QApplication.topLevelWidgets()
        if hasattr(top_level, 'databases_tab')
    )
    return next(candidates, None)

src/create_symlinks.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@ def create_symlinks_parallel(source: Union[str, Path, List[str], List[Path]],
1717
target_dir: Union[str, Path] = "Docs_for_DB") -> Tuple[int, list]:
1818
"""
1919
Create symbolic links using multiprocessing if the number of files exceeds 500.
20-
20+
2121
Args:
2222
source: Can be either:
2323
- str or Path: Path to the source directory
2424
- List[str] or List[Path]: List of file paths
2525
target_dir: Path to the directory to store symlinks (default: 'Docs_for_DB')
26-
26+
2727
Returns:
2828
tuple: (number of links created, list of errors)
2929
"""
@@ -33,14 +33,12 @@ def create_symlinks_parallel(source: Union[str, Path, List[str], List[Path]],
3333
return 0, []
3434

3535
try:
36-
# Handle directory input
3736
if isinstance(source, (str, Path)) and not isinstance(source, list):
3837
source_dir = Path(source)
3938
if not source_dir.exists():
4039
raise ValueError(f"Source directory does not exist: {source_dir}")
4140
files = [(str(p), str(target_dir)) for p in source_dir.iterdir() if p.is_file()]
4241

43-
# Handle list of files input
4442
elif isinstance(source, list):
4543
files = [(str(Path(p)), str(target_dir)) for p in source]
4644

@@ -72,8 +70,8 @@ def create_symlinks_parallel(source: Union[str, Path, List[str], List[Path]],
7270
print("\nErrors occurred:")
7371
for error in errors:
7472
print(error)
75-
73+
7674
return count, errors
77-
75+
7876
except Exception as e:
7977
raise RuntimeError(f"An error occurred: {str(e)}")

src/database_interactions.py

Lines changed: 14 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def create(self):
5353
prepared_encode_kwargs = self.prepare_encode_kwargs()
5454
return HuggingFaceEmbeddings(
5555
model_name=self.model_name,
56-
show_progress=not self.is_query, # only show progress for database creation
56+
show_progress=not self.is_query,
5757
model_kwargs=prepared_kwargs,
5858
encode_kwargs=prepared_encode_kwargs
5959
)
@@ -129,13 +129,6 @@ def prepare_kwargs(self):
129129
logging.debug(f"is_cuda: {is_cuda}")
130130
logging.debug(f"use_xformers: {use_xformers}")
131131

132-
# Add this tokenizer configuration to fix the error
133-
stella_kwargs["tokenizer_kwargs"] = {
134-
"max_length": 8192,
135-
"padding": True,
136-
"truncation": True
137-
}
138-
139132
stella_kwargs["config_kwargs"] = {
140133
"use_memory_efficient_attention": use_xformers,
141134
"unpad_inputs": use_xformers,
@@ -152,33 +145,31 @@ def prepare_kwargs(self):
152145

153146
class AlibabaEmbedding(BaseEmbeddingModel):
154147
def prepare_kwargs(self):
155-
logging.debug("Starting AlibabaEmbedding prepare_kwargs.")
156148
ali_kwargs = deepcopy(self.model_kwargs)
157-
logging.debug(f"Original model_kwargs: {self.model_kwargs}")
158-
159-
compute_device = self.model_kwargs.get("device", "").lower()
149+
compute_device = ali_kwargs.get("device", "").lower()
160150
is_cuda = compute_device == "cuda"
161151
use_xformers = is_cuda and supports_flash_attention()
162-
logging.debug(f"Device: {compute_device}")
163-
logging.debug(f"is_cuda: {is_cuda}")
164-
logging.debug(f"use_xformers: {use_xformers}")
165-
166152
ali_kwargs["tokenizer_kwargs"] = {
167-
"max_length": 8192,
168-
"padding": True,
169-
"truncation": True
153+
"padding": "longest",
154+
"truncation": True,
155+
"max_length": 8192
170156
}
171-
172157
ali_kwargs["config_kwargs"] = {
173158
"use_memory_efficient_attention": use_xformers,
174159
"unpad_inputs": use_xformers,
175160
"attn_implementation": "eager" if use_xformers else "sdpa"
176161
}
177-
logging.debug(f"Set 'config_kwargs': {ali_kwargs['config_kwargs']}")
178-
179-
logging.debug(f"Final ali_kwargs: {ali_kwargs}")
180162
return ali_kwargs
181163

164+
def prepare_encode_kwargs(self):
165+
encode_kwargs = super().prepare_encode_kwargs()
166+
encode_kwargs.update({
167+
"padding": True,
168+
"truncation": True,
169+
"max_length": 8192
170+
})
171+
return encode_kwargs
172+
182173

183174
def create_vector_db_in_process(database_name):
184175
create_vector_db = CreateVectorDB(database_name=database_name)
@@ -306,7 +297,6 @@ def create_database(self, texts, embeddings):
306297
all_ids = []
307298
chunk_counters = defaultdict(int)
308299

309-
# Process all texts and generate IDs
310300
for doc in texts:
311301
file_hash = doc.metadata.get('hash')
312302
chunk_counters[file_hash] += 1

src/extract_metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# works with langchain 0.3+
1+
# extract_metadata.py
22

33
import os
44
import datetime

0 commit comments

Comments
 (0)