Merge pull request #3 from yixin0829/codex/diagnose-nonetype-application-thread-error

yixin0829 · web-flow · commit c09d469343ba · 2025-08-01T18:40:03.000-04:00
Fix stop app race condition
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,116 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Development Commands
+
+### Installation and Setup
+```bash
+# Install dependencies using uv
+uv sync
+
+# Run GUI application
+uv run python main_gui.py
+
+# Run console application
+uv run python main_console.py
+```
+
+### Building and Packaging
+```bash
+# Build Windows executable (GUI version)
+.\build.bat
+
+# Manual build with PyInstaller
+uv run pyinstaller push_to_talk.spec
+
+# For console executable: modify push_to_talk.spec to use main_console.py and set console=True
+```
+
+### Code Quality
+```bash
+# Format code with ruff
+uv run ruff format
+
+# Lint code with ruff
+uv run ruff check
+
+# Fix linting issues automatically
+uv run ruff check --fix
+```
+
+## Architecture Overview
+
+This is a Windows push-to-talk speech-to-text application with dual interfaces (GUI and console) that uses OpenAI's API for transcription and text refinement.
+
+### Core Components
+- **PushToTalkApp** (`src/push_to_talk.py`): Main orchestrator with configuration management and dynamic updates
+- **ConfigurationGUI** (`src/config_gui.py`): Persistent GUI interface with real-time status management
+- **AudioRecorder** (`src/audio_recorder.py`): PyAudio-based recording with configurable audio settings
+- **Transcriber** (`src/transcription.py`): OpenAI Whisper integration for speech-to-text
+- **TextRefiner** (`src/text_refiner.py`): GPT-based text improvement and correction
+- **TextInserter** (`src/text_inserter.py`): Text insertion into the active window via clipboard paste (pyperclip) or simulated keystrokes ("sendkeys", pyautogui)
+- **HotkeyService** (`src/hotkey_service.py`): Global hotkey detection requiring admin privileges
+
+### Entry Points
+- **main_gui.py**: GUI application with persistent configuration interface
+- **main_console.py**: Console-based application for command-line usage
+- **Built executable**: `dist/PushToTalk.exe` (GUI version, no console window)
+
+### Data Flow
+1. User presses hotkey → Audio recording starts with optional audio feedback
+2. User releases hotkey → Recording stops, audio saved to temp file
+3. Audio sent to the configured OpenAI speech-to-text model (`stt_model`, e.g. Whisper) for transcription
+4. Raw text optionally refined using GPT models
+5. Final text (refined when refinement is enabled) inserted into the active window via clipboard paste or simulated keystrokes
+
+### Configuration System
+- **File-based**: `push_to_talk_config.json` for persistent settings
+- **Environment**: `OPENAI_API_KEY` environment variable support
+- **GUI**: Real-time configuration with validation and testing
+- **Dynamic updates**: Application can update configuration without restart
+
+## Key Technical Details
+
+### Windows-Specific Requirements
+- **Administrator privileges**: Required for global hotkey detection
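+- **Text insertion libraries**: pyautogui (simulated keystrokes) and pyperclip (clipboard) are used for text insertion in place of pywin32; audio feedback uses the Windows winsound module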
+- **Audio permissions**: Microphone access required for recording
+
+### Audio Processing
+- **Sample rates**: 8kHz-44.1kHz supported, 16kHz recommended for Whisper
+- **Formats**: WAV files for temporary audio storage
+- **Feedback**: Optional audio cues using Windows winsound module
+
+### Text Insertion Methods
+- **sendkeys**: Character-by-character simulation, better for special characters
+- **clipboard**: Faster method using Ctrl+V, may not work in all applications
+
+### Configuration Parameters
+Key settings in `PushToTalkConfig` class:
+- `openai_api_key`: Required for transcription and refinement
+- `stt_model`: "gpt-4o-transcribe" or "whisper-1"
+- `refinement_model`: "gpt-4.1-nano", "gpt-4o-mini", or "gpt-4o"
+- `hotkey`/`toggle_hotkey`: Customizable key combinations
+- `insertion_method`: "sendkeys" or "clipboard"
+- `enable_text_refinement`: Toggle GPT text improvement
+
+## Development Workflow
+
+### Making Changes
+1. Test changes with both GUI and console applications
+2. Ensure admin privileges are handled correctly for hotkey functionality
+3. Validate OpenAI API integration with proper error handling
+4. Test text insertion in various Windows applications
+
+### Building for Distribution
+1. Use `build.bat` for standard GUI executable
+2. Modify `push_to_talk.spec` for console builds or customization
+3. Test executable on clean Windows system without Python installed
+4. Consider antivirus false positives with PyInstaller executables
+
+### Configuration Testing
+- Use GUI "Test Configuration" button for API validation
+- Verify that hotkey combinations don't conflict with system shortcuts
+- Verify text insertion works in target applications (text editors, browsers, etc.)
+- Check audio settings produce clear recordings for transcription accuracy
diff --git a/build.bat b/build.bat
@@ -2,9 +2,9 @@
 echo Building PushToTalk GUI Windows Executable...
 echo.
 
-REM Clean previous builds
-if exist "dist" rmdir /s /q "dist"
-if exist "build" rmdir /s /q "build"
+REM Clean previous .exe and .zip files
+if exist "dist\PushToTalk.exe" del /f /q "dist\PushToTalk.exe"
+if exist "dist\PushToTalk.zip" del /f /q "dist\PushToTalk.zip"
 
 REM Build the executable
 echo Building GUI application with PyInstaller...
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,8 +8,8 @@ dependencies = [
     "keyboard>=0.13.5",
     "openai>=1.97.1",
     "pyaudio>=0.2.14",
-    "pywin32>=309",
-    "websocket-client>=1.8.0",
+    "pyautogui>=0.9.54",
+    "pyperclip>=1.9.0",
 ]
 
 [dependency-groups]
diff --git a/src/config_gui.py b/src/config_gui.py
@@ -622,7 +622,7 @@ def _run_application_thread(self):
                 self.app_instance.start(setup_signals=False)
 
                 # Keep running until stopped
-                while self.app_instance.is_running:
+                while self.app_instance and self.app_instance.is_running:
                     import time
 
                     time.sleep(0.1)
@@ -658,6 +658,10 @@ def _stop_application(self):
             )
             self._update_status_display()
 
+            # Wait for the background thread to finish before clearing references
+            if self.app_thread and self.app_thread.is_alive():
+                self.app_thread.join(timeout=1)
+
             self.app_instance = None
             self.app_thread = None
 
diff --git a/src/text_inserter.py b/src/text_inserter.py
@@ -1,10 +1,10 @@
 import time
 import logging
+import sys
 from typing import Optional
-import win32gui
-import win32con
-import win32clipboard
-import win32api
+
+import pyautogui
+import pyperclip
 
 logger = logging.getLogger(__name__)
 
@@ -48,38 +48,21 @@ def insert_text(self, text: str, method: str = "clipboard") -> bool:
             return False
 
     def _insert_via_clipboard(self, text: str) -> bool:
-        """
-        Insert text by copying to clipboard and pasting.
-        This is generally more reliable for longer texts.
-        """
-        try:
-            # Get the current active window
-            active_window = win32gui.GetForegroundWindow()
-            if not active_window:
-                logger.error("No active window found")
-                return False
-
-            # Save current clipboard content
-            original_clipboard = self._get_clipboard_text()
+        """Insert text by copying to clipboard and pasting."""
 
-            # Copy text to clipboard
-            self._set_clipboard_text(text)
+        try:
+            original_clipboard = pyperclip.paste()
+            pyperclip.copy(text)
 
-            # Small delay to ensure clipboard is set
             time.sleep(0.05)
 
-            # Send Ctrl+V to paste
-            win32api.keybd_event(win32con.VK_CONTROL, 0, 0, 0)
-            win32api.keybd_event(ord("V"), 0, 0, 0)
-            win32api.keybd_event(ord("V"), 0, win32con.KEYEVENTF_KEYUP, 0)
-            win32api.keybd_event(win32con.VK_CONTROL, 0, win32con.KEYEVENTF_KEYUP, 0)
+            paste_keys = ["command", "v"] if sys.platform == "darwin" else ["ctrl", "v"]
+            pyautogui.hotkey(*paste_keys)
 
-            # Small delay before restoring clipboard
             time.sleep(0.1)
 
-            # Restore original clipboard content
-            if original_clipboard is not None:
-                self._set_clipboard_text(original_clipboard)
+            if original_clipboard:
+                pyperclip.copy(original_clipboard)
 
             logger.info(f"Text inserted via clipboard: {len(text)} characters")
             return True
@@ -89,54 +72,10 @@ def _insert_via_clipboard(self, text: str) -> bool:
             return False
 
     def _insert_via_sendkeys(self, text: str) -> bool:
-        """
-        Insert text by simulating individual keystrokes.
-        Better for short texts but slower for longer ones.
-        """
-        try:
-            active_window = win32gui.GetForegroundWindow()
-            if not active_window:
-                logger.error("No active window found")
-                return False
-
-            # Send each character individually
-            for char in text:
-                if char == "\n":
-                    # Send Enter for newlines
-                    win32api.keybd_event(win32con.VK_RETURN, 0, 0, 0)
-                    win32api.keybd_event(
-                        win32con.VK_RETURN, 0, win32con.KEYEVENTF_KEYUP, 0
-                    )
-                elif char == "\t":
-                    # Send Tab for tabs
-                    win32api.keybd_event(win32con.VK_TAB, 0, 0, 0)
-                    win32api.keybd_event(
-                        win32con.VK_TAB, 0, win32con.KEYEVENTF_KEYUP, 0
-                    )
-                else:
-                    # Convert character to virtual key code
-                    vk_code = win32api.VkKeyScan(char)
-                    if vk_code != -1:
-                        # Handle shift modifier for uppercase letters and symbols
-                        if vk_code & 0x100:  # Shift key needed
-                            win32api.keybd_event(win32con.VK_SHIFT, 0, 0, 0)
-                            win32api.keybd_event(vk_code & 0xFF, 0, 0, 0)
-                            win32api.keybd_event(
-                                vk_code & 0xFF, 0, win32con.KEYEVENTF_KEYUP, 0
-                            )
-                            win32api.keybd_event(
-                                win32con.VK_SHIFT, 0, win32con.KEYEVENTF_KEYUP, 0
-                            )
-                        else:
-                            win32api.keybd_event(vk_code & 0xFF, 0, 0, 0)
-                            win32api.keybd_event(
-                                vk_code & 0xFF, 0, win32con.KEYEVENTF_KEYUP, 0
-                            )
-
-                # Small delay between keystrokes
-                if self.insertion_delay > 0:
-                    time.sleep(self.insertion_delay)
+        """Insert text by simulating individual keystrokes."""
 
+        try:
+            pyautogui.write(text, interval=self.insertion_delay)
             logger.info(f"Text inserted via sendkeys: {len(text)} characters")
             return True
 
@@ -147,23 +86,13 @@ def _insert_via_sendkeys(self, text: str) -> bool:
     def _get_clipboard_text(self) -> Optional[str]:
         """Get current clipboard text content."""
         try:
-            win32clipboard.OpenClipboard()
-            data = win32clipboard.GetClipboardData(win32con.CF_TEXT)
-            win32clipboard.CloseClipboard()
-            return data.decode("utf-8") if isinstance(data, bytes) else data
+            return pyperclip.paste()
         except Exception:
-            try:
-                win32clipboard.CloseClipboard()
-            except Exception:
-                pass
             return None
 
-    def _set_clipboard_text(self, text: str):
+    def _set_clipboard_text(self, text: str) -> None:
         """Set clipboard text content."""
-        win32clipboard.OpenClipboard()
-        win32clipboard.EmptyClipboard()
-        win32clipboard.SetClipboardText(text)
-        win32clipboard.CloseClipboard()
+        pyperclip.copy(text)
 
     def get_active_window_title(self) -> Optional[str]:
         """
@@ -173,10 +102,9 @@ def get_active_window_title(self) -> Optional[str]:
             Window title or None if no active window
         """
         try:
-            active_window = win32gui.GetForegroundWindow()
-            if active_window:
-                window_title = win32gui.GetWindowText(active_window)
-                return window_title if window_title else None
+            window = pyautogui.getActiveWindow()
+            if window:
+                return window.title if window.title else None
             return None
         except Exception as e:
             logger.error(f"Failed to get active window title: {e}")
diff --git a/uv.lock b/uv.lock